Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OCR Block v2 #706

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docker/dockerfiles/Dockerfile.onnx.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ RUN pip3 install --upgrade pip && pip3 install \
-r requirements.transformers.txt \
jupyterlab \
wheel>=0.38.0 \
setuptools>=65.5.1 \
--upgrade \
&& rm -rf ~/.cache/pip

Expand Down Expand Up @@ -74,4 +75,5 @@ ENV API_LOGGING_ENABLED=True
ENV CORE_MODEL_SAM2_ENABLED=True
ENV CORE_MODEL_OWLV2_ENABLED=True

ENTRYPOINT uvicorn cpu_http:app --workers $NUM_WORKERS --host $HOST --port $PORT
RUN pip install watchdog[watchmedo]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ENTRYPOINT watchmedo auto-restart --directory=/app/inference --pattern=*.py --recursive -- uvicorn cpu_http:app --workers $NUM_WORKERS --host $HOST --port $PORT
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from abc import ABC, abstractmethod
from typing import Callable, List

from inference.core.workflows.core_steps.common.entities import (
StepExecutionMode,
)
from inference.core.workflows.execution_engine.entities.base import (
Batch,
WorkflowImageData,
)
from inference.core.workflows.prototypes.block import BlockResult


class BaseOCRModel(ABC):
    """Abstract interface shared by OCR model wrappers used by the OCR workflow block.

    Subclasses implement ``run``, which executes OCR over a batch of images
    (locally or remotely, depending on ``step_execution_mode``) and shapes
    the raw predictions through ``post_process_result``.
    """

    def __init__(self, model_manager, api_key):
        # model_manager loads and serves core models; api_key authenticates
        # requests made on behalf of the caller.
        self.model_manager = model_manager
        self.api_key = api_key

    @abstractmethod
    def run(
        self,
        images: Batch[WorkflowImageData],
        step_execution_mode: StepExecutionMode,
        post_process_result: Callable[
            [Batch[WorkflowImageData], List[dict]], BlockResult
        ],
    ) -> BlockResult:
        """Run OCR on `images` and return the post-processed block result."""
        pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from inference.core.entities.requests.doctr import DoctrOCRInferenceRequest
from inference.core.workflows.core_steps.common.entities import (
StepExecutionMode,
)
from inference.core.workflows.core_steps.common.utils import load_core_model
from inference.core.workflows.execution_engine.entities.base import (
Batch,
WorkflowImageData,
)
from inference.core.workflows.prototypes.block import BlockResult
from typing import Callable, List

from .base import BaseOCRModel


class DoctrOCRModel(BaseOCRModel):
    """OCR wrapper that dispatches doctr core-model inference locally or remotely."""

    def run(
        self,
        images: Batch[WorkflowImageData],
        step_execution_mode: StepExecutionMode,
        post_process_result: Callable[
            [Batch[WorkflowImageData], List[dict]], BlockResult
        ],
    ) -> BlockResult:
        """Execute OCR in the requested mode.

        Raises:
            ValueError: if `step_execution_mode` is not a recognized mode.
                (Previously an unknown mode silently returned ``None``.)
        """
        if step_execution_mode is StepExecutionMode.LOCAL:
            return self.run_locally(images, post_process_result)
        elif step_execution_mode is StepExecutionMode.REMOTE:
            return self.run_remotely(images, post_process_result)
        raise ValueError(
            f"Unknown step execution mode: {step_execution_mode}"
        )

    def run_locally(
        self,
        images: Batch[WorkflowImageData],
        post_process_result: Callable[
            [Batch[WorkflowImageData], List[dict]], BlockResult
        ],
    ) -> BlockResult:
        """Run the doctr core model in-process for each image in the batch."""
        predictions = []
        for single_image in images:
            inference_request = DoctrOCRInferenceRequest(
                # numpy_preferred avoids an unnecessary encode/decode round-trip
                image=single_image.to_inference_format(numpy_preferred=True),
                api_key=self.api_key,
            )
            doctr_model_id = load_core_model(
                model_manager=self.model_manager,
                inference_request=inference_request,
                core_model="doctr",
            )
            result = self.model_manager.infer_from_request_sync(
                doctr_model_id, inference_request
            )
            predictions.append(result.model_dump())
        return post_process_result(images, predictions)

    def run_remotely(
        self,
        images: Batch[WorkflowImageData],
        post_process_result: Callable[
            [Batch[WorkflowImageData], List[dict]], BlockResult
        ],
    ) -> BlockResult:
        """Remote execution is not supported for doctr."""
        raise NotImplementedError(
            "Remote execution is not implemented for DoctrOCRModel."
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# models/google_cloud_vision.py

from .base import BaseOCRModel
from inference.core.workflows.core_steps.common.entities import (
StepExecutionMode,
)
from inference.core.workflows.execution_engine.entities.base import (
Batch,
WorkflowImageData,
)
from typing import Optional
import requests


class GoogleCloudVisionOCRModel(BaseOCRModel):
    """OCR wrapper calling the Google Cloud Vision `images:annotate` REST API."""

    def __init__(
        self, model_manager, api_key: Optional[str], google_cloud_api_key: str
    ):
        super().__init__(model_manager, api_key)
        # Separate credential: the Google Cloud API key, distinct from the
        # platform api_key held by the base class.
        self.google_cloud_api_key = google_cloud_api_key

    def run(
        self,
        images: Batch[WorkflowImageData],
        step_execution_mode: StepExecutionMode,
        post_process_result,
    ):
        """OCR each image via TEXT_DETECTION and post-process the results.

        Note: execution is always remote (HTTP); `step_execution_mode` is
        accepted for interface compatibility but not consulted here.

        Raises:
            Exception: if the Vision API responds with a non-200 status.
        """
        predictions = []
        for image_data in images:
            encoded_image = image_data.base64_image
            url = (
                f"https://vision.googleapis.com/v1/images:annotate"
                f"?key={self.google_cloud_api_key}"
            )

            payload = {
                "requests": [
                    {
                        "image": {"content": encoded_image},
                        "features": [{"type": "TEXT_DETECTION"}],
                    }
                ]
            }
            # Send the request; bound it so a stalled connection cannot
            # hang the workflow indefinitely.
            response = requests.post(url, json=payload, timeout=60)
            if response.status_code == 200:
                result = response.json()
                text_annotations = result["responses"][0].get(
                    "textAnnotations",
                    [],
                )
                if text_annotations:
                    # First annotation aggregates the full detected text.
                    text = text_annotations[0]["description"]
                else:
                    text = ""
            else:
                # The error body is usually JSON, but guard against HTML or
                # plain-text error pages so we surface the real failure.
                try:
                    error_info = response.json().get("error", {})
                    message = error_info.get("message", response.text)
                except ValueError:
                    message = response.text
                raise Exception(
                    f"Google Cloud Vision API request failed: {message}",
                )
            prediction = {"result": text}
            predictions.append(prediction)
        return post_process_result(images, predictions)
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from .base import BaseOCRModel
from inference.core.workflows.core_steps.common.entities import (
StepExecutionMode,
)
from inference.core.workflows.execution_engine.entities.base import (
Batch,
WorkflowImageData,
)
from typing import Optional, List, Callable
from inference.core.workflows.prototypes.block import BlockResult

import requests
import json
import base64


class MathpixOCRModel(BaseOCRModel):
    """OCR wrapper calling the Mathpix `v3/text` REST API (math-aware OCR)."""

    def __init__(
        self,
        model_manager,
        api_key: Optional[str],
        mathpix_app_id: str,
        mathpix_app_key: str,
    ):
        super().__init__(model_manager, api_key)
        # Mathpix uses an app_id/app_key pair, distinct from the platform
        # api_key held by the base class.
        self.mathpix_app_id = mathpix_app_id
        self.mathpix_app_key = mathpix_app_key

    def run(
        self,
        images: Batch[WorkflowImageData],
        step_execution_mode: StepExecutionMode,
        post_process_result: Callable[
            [Batch[WorkflowImageData], List[dict]], BlockResult
        ],
    ) -> BlockResult:
        """OCR each image via Mathpix and post-process the results.

        Note: execution is always remote (HTTP); `step_execution_mode` is
        accepted for interface compatibility but not consulted here.

        Raises:
            Exception: if the Mathpix API responds with a non-200 status.
        """
        predictions = []
        for image_data in images:
            # Decode base64 image to bytes for multipart file upload
            image_bytes = base64.b64decode(image_data.base64_image)

            # Prepare the request
            url = "https://api.mathpix.com/v3/text"
            headers = {
                "app_id": self.mathpix_app_id,
                "app_key": self.mathpix_app_key,
            }
            data = {
                "options_json": json.dumps(
                    {
                        "math_inline_delimiters": ["$", "$"],
                        "rm_spaces": True,
                    }
                )
            }
            files = {"file": ("image.jpg", image_bytes, "image/jpeg")}

            # Send the request; bound it so a stalled connection cannot
            # hang the workflow indefinitely.
            response = requests.post(
                url,
                headers=headers,
                data=data,
                files=files,
                timeout=60,
            )

            if response.status_code == 200:
                result = response.json()
                # Extract the text result
                text = result.get("text", "")
            else:
                # Guard against non-JSON error bodies so the original HTTP
                # failure is reported instead of a JSON decode error.
                try:
                    error_info = response.json().get("error", {})
                    message = error_info.get("message", response.text)
                    detailed_message = error_info.get("detail", "")
                except ValueError:
                    message = response.text
                    detailed_message = ""

                raise Exception(
                    f"Mathpix API request failed: {message} \n\n"
                    f"Detailed: {detailed_message}"
                )

            prediction = {"result": text}
            predictions.append(prediction)

        return post_process_result(images, predictions)
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from typing import Callable, List

from inference.core.entities.requests.trocr import TrOCRInferenceRequest
from inference.core.workflows.core_steps.common.entities import (
StepExecutionMode,
)
from inference.core.workflows.core_steps.common.utils import load_core_model
from inference.core.workflows.execution_engine.entities.base import (
Batch,
WorkflowImageData,
)
from inference.core.workflows.prototypes.block import BlockResult

from .base import BaseOCRModel


class TrOCRModel(BaseOCRModel):
    """OCR wrapper that dispatches TrOCR core-model inference locally or remotely."""

    def run(
        self,
        images: Batch[WorkflowImageData],
        step_execution_mode: StepExecutionMode,
        post_process_result: Callable[
            [Batch[WorkflowImageData], List[dict]], BlockResult
        ],
    ) -> BlockResult:
        """Execute OCR in the requested mode.

        Raises:
            ValueError: if `step_execution_mode` is not a recognized mode.
                (Previously an unknown mode silently returned ``None``.)
        """
        if step_execution_mode is StepExecutionMode.LOCAL:
            return self.run_locally(images, post_process_result)
        elif step_execution_mode is StepExecutionMode.REMOTE:
            return self.run_remotely(images, post_process_result)
        raise ValueError(
            f"Unknown step execution mode: {step_execution_mode}"
        )

    def run_locally(
        self,
        images: Batch[WorkflowImageData],
        post_process_result: Callable[
            [Batch[WorkflowImageData], List[dict]], BlockResult
        ],
    ) -> BlockResult:
        """Run the TrOCR core model in-process for each image in the batch."""
        predictions = []
        for single_image in images:
            inference_request = TrOCRInferenceRequest(
                # numpy_preferred avoids an unnecessary encode/decode round-trip
                image=single_image.to_inference_format(numpy_preferred=True),
                api_key=self.api_key,
            )
            trocr_model_id = load_core_model(
                model_manager=self.model_manager,
                inference_request=inference_request,
                core_model="trocr",
            )
            result = self.model_manager.infer_from_request_sync(
                trocr_model_id, inference_request
            )
            predictions.append(result.model_dump())
        return post_process_result(images, predictions)

    def run_remotely(
        self,
        images: Batch[WorkflowImageData],
        post_process_result: Callable[
            [Batch[WorkflowImageData], List[dict]], BlockResult
        ],
    ) -> BlockResult:
        """Remote execution is not supported for TrOCR."""
        raise NotImplementedError(
            "Remote execution is not implemented for TrOCRModel.",
        )
Loading
Loading