Skip to content

Commit

Permalink
Merge pull request #834 from hcharbonnier/PaddleOCR_Detection
Browse files Browse the repository at this point in the history
Add support for PaddleOCR (detection only)
  • Loading branch information
zyddnys authored Feb 5, 2025
2 parents b80fd18 + 3c1a0fe commit 0f39e4a
Show file tree
Hide file tree
Showing 6 changed files with 126 additions and 3 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ COPY requirements.txt /app/requirements.txt

RUN export TZ=Etc/UTC ; \
apt update --yes \
&& apt install g++ ffmpeg libsm6 libxext6 gimp --yes \
&& apt install g++ ffmpeg libsm6 libxext6 gimp libcudnn8-dev --yes \
&& pip install -r /app/requirements.txt \
&& apt remove g++ --yes \
&& apt autoremove --yes \
Expand All @@ -21,7 +21,7 @@ COPY . /app
# Prepare models
RUN python -u docker_prepare.py --continue-on-error

RUN rm -rf /tmp
RUN rm -rf /tmp && mkdir /tmp && chmod 1777 /tmp

# Add /app to Python module path
ENV PYTHONPATH="/app"
Expand Down
1 change: 1 addition & 0 deletions manga_translator/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ class Detector(str, Enum):
dbconvnext = "dbconvnext"
ctd = "ctd"
craft = "craft"
paddle = "paddle"
none = "none"

class Inpainter(str, Enum):
Expand Down
7 changes: 6 additions & 1 deletion manga_translator/detection/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .dbnet_convnext import DBConvNextDetector
from .ctd import ComicTextDetector
from .craft import CRAFTDetector
from .paddle import PaddleDetector
from .none import NoneDetector
from .common import CommonDetector, OfflineDetector
from ..config import Detector
Expand All @@ -13,6 +14,7 @@
Detector.dbconvnext: DBConvNextDetector,
Detector.ctd: ComicTextDetector,
Detector.craft: CRAFTDetector,
Detector.paddle: PaddleDetector,
Detector.none: NoneDetector,
}
detector_cache = {}
Expand All @@ -34,5 +36,8 @@ async def dispatch(detector_key: Detector, image: np.ndarray, detect_size: int,
invert: bool, gamma_correct: bool, rotate: bool, auto_rotate: bool = False, device: str = 'cpu', verbose: bool = False):
detector = get_detector(detector_key)
if isinstance(detector, OfflineDetector):
await detector.load(device)
if isinstance(detector, PaddleDetector):
await detector.load(device, text_threshold=text_threshold, box_threshold=box_threshold, unclip_ratio=unclip_ratio, invert=invert, verbose=verbose)
else:
await detector.load(device)
return await detector.detect(image, detect_size, text_threshold, box_threshold, unclip_ratio, invert, gamma_correct, rotate, auto_rotate, verbose)
114 changes: 114 additions & 0 deletions manga_translator/detection/paddle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import os
import shutil
import numpy as np
import cv2
from paddleocr import PaddleOCR
from typing import List, Tuple

from .common import OfflineDetector
from ..utils import TextBlock, Quadrilateral
from ..utils.inference import ModelWrapper

# Module-level holder for the PaddleOCR instance. PaddleDetector._load assigns
# it, PaddleDetector._unload resets it to None, and PaddleDetector._infer reads
# it, so it is shared by every PaddleDetector instance in this process.
MODEL = None

class PaddleDetector(OfflineDetector, ModelWrapper):
    """Text detector backed by PaddleOCR, used in detection-only mode.

    The PaddleOCR instance is stored in the module-level ``MODEL`` global
    rather than on the detector instance.
    """

    # Archives handed to the model-download machinery: for each entry the
    # download URL, the expected hash, and the archive members to extract
    # together with the sub-directory each one is placed in.
    _MODEL_MAPPING = {
        'det': {
            'url': 'https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_server_infer.tar',
            'hash': '0c0e4fc2ef31dcfbb45fb8d29bd8e702ec55a240d62c32ff814270d8be6e6179',
            'archive': {
                'ch_PP-OCRv4_det_server_infer/inference.pdiparams': 'ch_PP-OCRv4_det_server_infer/',
                'ch_PP-OCRv4_det_server_infer/inference.pdiparams.info': 'ch_PP-OCRv4_det_server_infer/',
                'ch_PP-OCRv4_det_server_infer/inference.pdmodel': 'ch_PP-OCRv4_det_server_infer/',
            },
        },
        'rec': {
            'url': 'https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar',
            'hash': '830ea228e20c2b30c4db9666066c48512f67a63f5b1a32d0d33dc9170040ce7d',
            'archive': {
                'ch_PP-OCRv4_rec_infer/inference.pdiparams': 'ch_PP-OCRv4_rec_infer/',
                'ch_PP-OCRv4_rec_infer/inference.pdiparams.info': 'ch_PP-OCRv4_rec_infer/',
                'ch_PP-OCRv4_rec_infer/inference.pdmodel': 'ch_PP-OCRv4_rec_infer/',
            },
        },
        'cls': {
            'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar',
            'hash': '507352585040d035da3b1e6374694ad679a850acb0a36a8d0d47984176357717',
            'archive': {
                'ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams': 'ch_ppocr_mobile_v2.0_cls_infer/',
                'ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel': 'ch_ppocr_mobile_v2.0_cls_infer/',
            },
        },
    }

    def __init__(self, *args, **kwargs):
        """Initialise the ModelWrapper part explicitly, then the rest of the MRO."""
        ModelWrapper.__init__(self)
        super().__init__(*args, **kwargs)

    async def _load(self, device: str, text_threshold: float, box_threshold: float, unclip_ratio: float, invert: bool = False, verbose: bool = False):
        """Build the PaddleOCR instance and store it in the ``MODEL`` global.

        Args:
            device: Target device name; ``'cuda'`` and ``'mps'`` enable
                ``use_gpu``, anything else runs on CPU.
            text_threshold: Forwarded to PaddleOCR as ``det_db_thresh``.
            box_threshold: Forwarded to PaddleOCR as ``det_db_box_thresh``.
            unclip_ratio: Forwarded to PaddleOCR as ``det_db_unclip_ratio``.
            invert: Forwarded to PaddleOCR as ``invert``.
            verbose: Forwarded to PaddleOCR as ``verbose``.

        Note:
            The thresholds are fixed here, at construction time; ``_infer``
            does not forward its own threshold arguments to PaddleOCR.
        """
        await self.download()
        self.device = device
        self.text_threshold = text_threshold
        self.box_threshold = box_threshold
        self.unclip_ratio = unclip_ratio
        self.invert = invert
        self.verbose = verbose
        self.use_gpu = device in ('cuda', 'mps')

        global MODEL
        MODEL = PaddleOCR(
            use_gpu=self.use_gpu,
            use_angle_cls=False,
            det_model_dir=os.path.join(self.model_dir, 'ch_PP-OCRv4_det_server_infer'),
            rec_model_dir=os.path.join(self.model_dir, 'ch_PP-OCRv4_rec_infer'),
            cls_model_dir=os.path.join(self.model_dir, 'ch_ppocr_mobile_v2.0_cls_infer'),
            det=True,
            rec=False,
            cls=False,
            det_db_thresh=self.text_threshold,
            det_db_box_thresh=self.box_threshold,
            det_db_unclip_ratio=self.unclip_ratio,
            invert=self.invert,
            verbose=self.verbose,
        )

    async def _unload(self):
        """Drop the reference to the shared PaddleOCR instance."""
        global MODEL
        MODEL = None

    async def _infer(self, image: np.ndarray, detect_size: int, text_threshold: float, box_threshold: float,
                     unclip_ratio: float, verbose: bool = False):
        """Detect text regions in ``image`` and build a matching binary mask.

        Args:
            image: Image array; its first two dimensions give the mask size.
            detect_size: Unused by this detector.
            text_threshold: Unused here (thresholds are applied in ``_load``).
            box_threshold: Unused here (thresholds are applied in ``_load``).
            unclip_ratio: Unused here (thresholds are applied in ``_load``).
            verbose: Unused here.

        Returns:
            A ``(textlines, mask, None)`` tuple. ``textlines`` is a list of
            ``Quadrilateral`` objects, one per detected region, holding the
            minimum-area rectangle of that region. ``mask`` is a ``uint8``
            array with 255 inside every detected region and its rectangle.
            Both are empty/all-zero when nothing is detected.
        """
        result = MODEL.ocr(image, det=True, rec=False)

        # PaddleOCR reports an image without any detected box as None instead
        # of an empty list, so normalise before iterating.
        detections = result[0] if result else None
        if detections is None:
            detections = []

        mask = np.zeros(image.shape[:2], dtype=np.uint8)
        textlines = []
        for line in detections:
            points = np.array(line).astype(np.int32)
            # PaddleOCR's detection-only output carries no score, so a fixed
            # probability of 1 is used.
            raw_quad = Quadrilateral(points, '', 1)
            # Paint the polygon using the points as stored by Quadrilateral.
            cv2.fillPoly(mask, [raw_quad.pts], color=255)

            # Replace the raw polygon by its minimum-area rectangle.
            # astype(np.int32) truncates like the former np.int0 call, which
            # no longer exists in NumPy 2.x.
            box = cv2.boxPoints(cv2.minAreaRect(raw_quad.pts)).astype(np.int32)
            # Rotate the corner list by two positions; this changes which
            # corner comes first, not the winding direction.
            box = np.roll(box, 2, axis=0)
            cv2.fillPoly(mask, [box], color=255)

            textlines.append(Quadrilateral(box, '', 1))

        return textlines, mask, None
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,5 @@ uvicorn
fastapi
pydantic
python-multipart
paddleocr
paddlepaddle-gpu==2.5.2
1 change: 1 addition & 0 deletions server/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ <h1 class="text-center text-lg font-light">Image/Manga Translator</h1>
v-model="textDetector">
<option value="default">Default</option>
<option value="ctd">CTD</option>
<option value="paddle">Paddle</option>
</select>
<i class="iconify absolute top-1.5 right-1 pointer-events-none"
data-icon="carbon:chevron-down"></i>
Expand Down

0 comments on commit 0f39e4a

Please sign in to comment.