If I run this code locally it works. On Cloud Run, I get "400 Request contains an invalid argument".
As side notes:
- The input file path is a temporary file obtained using the get_file_path function (e.g. /tmp/20240524_1011_470558_393480997312_ME7001b9a160047278515b83ff123c7545.jpg)
- Env variables are the same locally and on Cloud Run
- The MIME type helper (`_assign_mime_type`) correctly returns "image/jpeg" for this file
How do I fix it?
def get_file_path(filename):
    """Return a path for *filename* inside the system temporary directory.

    The name is first passed through ``secure_filename``; the result is
    then joined onto ``tempfile.gettempdir()``.
    """
    safe_name = secure_filename(filename)
    temp_dir = tempfile.gettempdir()
    return os.path.join(temp_dir, safe_name)
def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
) -> "documentai.Document":
    """Send a local file to a Document AI processor and return the result.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; also used to build the regional
            API endpoint ``{location}-documentai.googleapis.com``.
        processor_id: Identifier of the processor to call.
        file_path: Path of the local file to read and submit.
        mime_type: MIME type submitted alongside the file bytes.
        field_mask: Optional field mask forwarded to the request unchanged.
        processor_version_id: When truthy, the request targets this
            processor version instead of the processor itself.

    Returns:
        The ``document`` attribute of the ``process_document`` response.

    Note:
        Credentials are loaded from the service-account key file at
        ``config.Config.OCR_GCP_KEY_PATH``. The request always carries an
        ``IndividualPageSelector`` with ``pages=[1]``.
    """
    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient.from_service_account_json(
        config.Config.OCR_GCP_KEY_PATH, client_options=opts
    )
    logger.debug("Document AI Client correctly launched.")

    if processor_version_id:
        # The full resource name of the processor version, e.g.:
        # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        # The full resource name of the processor, e.g.:
        # `projects/{project_id}/locations/{location}/processors/{processor_id}`
        name = client.processor_path(project_id, location, processor_id)

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()
    logger.debug("Local image correctly loaded.")

    # Wrap the bytes just read together with the caller-supplied MIME type.
    raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)

    # For more information: https://cloud.google.com/document-ai/docs/reference/rest/v1/ProcessOptions
    # Optional: Additional configurations for processing.
    process_options = documentai.ProcessOptions(
        # Process only specific pages
        individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
            pages=[1]
        )
    )

    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=raw_document,
        field_mask=field_mask,
        process_options=process_options,
    )

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    document = result.document

    # Read the text recognition output from the processor
    logger.debug("The document contains the following text:")
    logger.debug(document.text)
    return document
# Gather the processor coordinates from the environment plus the file details,
# in the same order the call consumes them, then run the document through OCR.
ocr_kwargs = {
    "project_id": os.environ["GCP_PROJECT_ID"],
    "location": os.environ["PROCESSOR_LOCATION"],
    "processor_id": os.environ["PROCESSOR_ID"],
    "file_path": file_path,
    "mime_type": _assign_mime_type(file_path),
}
document = process_document_sample(**ocr_kwargs)
Consider avoiding `from_service_account_json` and instead using Application Default Credentials. It's not your biggest issue, but it would make your code safer and more flexible.
`content=image_content` appears incorrect. This value should be base64-encoded (see RawDocument).