Merge pull request #834 from hcharbonnier/PaddleOCR_Detection

Add support for PaddleOCR (detection only)
zyddnys · Feb 5, 2025 · 0f39e4a · 0f39e4a
2 parents b80fd18 + 3c1a0fe
commit 0f39e4a
Show file tree

Hide file tree

Showing 6 changed files with 126 additions and 3 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -10,7 +10,7 @@ COPY requirements.txt /app/requirements.txt
 
 RUN export TZ=Etc/UTC ; \
         apt update --yes \
-        && apt install g++ ffmpeg libsm6 libxext6 gimp --yes \
+        && apt install g++ ffmpeg libsm6 libxext6 gimp libcudnn8-dev --yes \
         && pip install -r /app/requirements.txt \
         && apt remove g++ --yes \
         && apt autoremove --yes \
@@ -21,7 +21,7 @@ COPY . /app
 # Prepare models
 RUN python -u docker_prepare.py --continue-on-error
 
-RUN rm -rf /tmp
+RUN rm -rf /tmp && mkdir /tmp && chmod 1777 /tmp
 
 # Add /app to Python module path
 ENV PYTHONPATH="/app"

diff --git a/manga_translator/config.py b/manga_translator/config.py
@@ -88,6 +88,7 @@ class Detector(str, Enum):
     dbconvnext = "dbconvnext"
     ctd = "ctd"
     craft = "craft"
+    paddle = "paddle"
     none = "none"
 
 class Inpainter(str, Enum):

diff --git a/manga_translator/detection/__init__.py b/manga_translator/detection/__init__.py
@@ -4,6 +4,7 @@
 from .dbnet_convnext import DBConvNextDetector
 from .ctd import ComicTextDetector
 from .craft import CRAFTDetector
+from .paddle import PaddleDetector
 from .none import NoneDetector
 from .common import CommonDetector, OfflineDetector
 from ..config import Detector
@@ -13,6 +14,7 @@
     Detector.dbconvnext: DBConvNextDetector,
     Detector.ctd: ComicTextDetector,
     Detector.craft: CRAFTDetector,
+    Detector.paddle: PaddleDetector,
     Detector.none: NoneDetector,
 }
 detector_cache = {}
@@ -34,5 +36,8 @@ async def dispatch(detector_key: Detector, image: np.ndarray, detect_size: int,
                    invert: bool, gamma_correct: bool, rotate: bool, auto_rotate: bool = False, device: str = 'cpu', verbose: bool = False):
     detector = get_detector(detector_key)
     if isinstance(detector, OfflineDetector):
-        await detector.load(device)
+        if isinstance(detector, PaddleDetector):
+            await detector.load(device, text_threshold=text_threshold, box_threshold=box_threshold, unclip_ratio=unclip_ratio, invert=invert, verbose=verbose)
+        else:
+            await detector.load(device)
     return await detector.detect(image, detect_size, text_threshold, box_threshold, unclip_ratio, invert, gamma_correct, rotate, auto_rotate, verbose)
diff --git a/manga_translator/detection/paddle.py b/manga_translator/detection/paddle.py
@@ -0,0 +1,114 @@
+import os
+import shutil
+import numpy as np
+import cv2
+from paddleocr import PaddleOCR
+from typing import List, Tuple
+
+from .common import OfflineDetector
+from ..utils import TextBlock, Quadrilateral
+from ..utils.inference import ModelWrapper
+
+MODEL = None
+
+class PaddleDetector(OfflineDetector, ModelWrapper):
+    """Text-line detector that runs only the detection stage of PaddleOCR.
+
+    The PaddleOCR instance is kept in the module-level ``MODEL`` global; it is
+    created by ``_load`` and dropped by ``_unload``.
+    """
+
+    # One entry per PaddleOCR sub-model. Each entry gives the download URL, the
+    # expected hash of the archive (presumably SHA-256 -- confirm against
+    # ModelWrapper) and the archive members mapped to their target directory.
+    _MODEL_MAPPING = {
+        'det': {
+            'url': 'https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_server_infer.tar',
+            'hash': '0c0e4fc2ef31dcfbb45fb8d29bd8e702ec55a240d62c32ff814270d8be6e6179',
+            'archive': {
+                'ch_PP-OCRv4_det_server_infer/inference.pdiparams': 'ch_PP-OCRv4_det_server_infer/',
+                'ch_PP-OCRv4_det_server_infer/inference.pdiparams.info': 'ch_PP-OCRv4_det_server_infer/',
+                'ch_PP-OCRv4_det_server_infer/inference.pdmodel': 'ch_PP-OCRv4_det_server_infer/',
+            },
+        },
+        'rec': {
+            'url': 'https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar',
+            'hash': '830ea228e20c2b30c4db9666066c48512f67a63f5b1a32d0d33dc9170040ce7d',
+            'archive': {
+                'ch_PP-OCRv4_rec_infer/inference.pdiparams': 'ch_PP-OCRv4_rec_infer/',
+                'ch_PP-OCRv4_rec_infer/inference.pdiparams.info': 'ch_PP-OCRv4_rec_infer/',
+                'ch_PP-OCRv4_rec_infer/inference.pdmodel': 'ch_PP-OCRv4_rec_infer/',
+            },
+        },
+        'cls': {
+            'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar',
+            'hash': '507352585040d035da3b1e6374694ad679a850acb0a36a8d0d47984176357717',
+            'archive': {
+                'ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams': 'ch_ppocr_mobile_v2.0_cls_infer/',
+                'ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel': 'ch_ppocr_mobile_v2.0_cls_infer/',
+            },
+        },
+    }
+
+    def __init__(self, *args, **kwargs):
+        """Initialise the ModelWrapper part explicitly, then the rest of the MRO."""
+        ModelWrapper.__init__(self)
+        super().__init__(*args, **kwargs)
+
+    async def _load(self, device: str, text_threshold: float, box_threshold: float, unclip_ratio: float, invert: bool = False, verbose: bool = False):
+        """Download the model files and build the global PaddleOCR instance.
+
+        The thresholds, ``unclip_ratio``, ``invert`` and ``verbose`` are stored
+        on the detector and handed to the ``PaddleOCR`` constructor, so they
+        are fixed for as long as this instance stays loaded.
+
+        Args:
+            device: Requested device name; ``'cuda'`` and ``'mps'`` turn on
+                ``use_gpu``, anything else runs without it.
+            text_threshold: Forwarded as ``det_db_thresh``.
+            box_threshold: Forwarded as ``det_db_box_thresh``.
+            unclip_ratio: Forwarded as ``det_db_unclip_ratio``.
+            invert: Forwarded as ``invert``.
+            verbose: Forwarded as ``verbose``.
+        """
+        global MODEL
+        await self.download()
+        self.device = device
+        self.text_threshold = text_threshold
+        self.box_threshold = box_threshold
+        self.unclip_ratio = unclip_ratio
+        self.invert = invert
+        self.verbose = verbose
+        # NOTE(review): 'mps' is treated like 'cuda' here -- confirm that the
+        # installed paddle build actually handles use_gpu=True on that device.
+        self.use_gpu = device in ('cuda', 'mps')
+        MODEL = PaddleOCR(
+            use_gpu=self.use_gpu,
+            use_angle_cls=False,
+            det_model_dir=os.path.join(self.model_dir, 'ch_PP-OCRv4_det_server_infer'),
+            rec_model_dir=os.path.join(self.model_dir, 'ch_PP-OCRv4_rec_infer'),
+            cls_model_dir=os.path.join(self.model_dir, 'ch_ppocr_mobile_v2.0_cls_infer'),
+            det=True,
+            rec=False,
+            cls=False,
+            det_db_thresh=self.text_threshold,
+            det_db_box_thresh=self.box_threshold,
+            det_db_unclip_ratio=self.unclip_ratio,
+            invert=self.invert,
+            verbose=self.verbose,
+        )
+
+    async def _unload(self):
+        """Drop the global PaddleOCR instance."""
+        global MODEL
+        MODEL = None
+
+    async def _infer(self, image: np.ndarray, detect_size: int, text_threshold: float, box_threshold: float,
+                     unclip_ratio: float, verbose: bool = False):
+        """Detect text lines in ``image`` and build a matching binary mask.
+
+        ``detect_size``, ``text_threshold``, ``box_threshold``, ``unclip_ratio``
+        and ``verbose`` are accepted for interface compatibility but are not
+        used here; the values given to ``_load`` are the ones in effect.
+
+        Returns:
+            A ``(textlines, mask, None)`` tuple. ``textlines`` holds one
+            ``Quadrilateral`` per detection, built from the minimum-area
+            rectangle of the detected polygon with a fixed score of 1.
+            ``mask`` is a ``uint8`` array with the image's height and width,
+            set to 255 inside every detected polygon and every rectangle.
+            Both are empty/all-zero when nothing was detected.
+        """
+        result = MODEL.ocr(image, det=True, rec=False)
+        mask = np.zeros((image.shape[0], image.shape[1]), dtype=np.uint8)
+
+        # PaddleOCR reports an image without any detection as ``None`` instead
+        # of an empty list, which must not be iterated.
+        detections = result[0] if result else None
+        if detections is None or len(detections) == 0:
+            return [], mask, None
+
+        textlines = []
+        for detection in detections:
+            # paddleocr does not return a score, so a fixed value of 1 is used
+            raw_line = Quadrilateral(np.array(detection).astype(np.int32), '', 1)
+            cv2.fillPoly(mask, [raw_line.pts], color=255)
+
+            # Replace the detected polygon by its minimum-area bounding
+            # rectangle. np.intp is the dtype the removed np.int0 alias
+            # referred to, so the resulting corner dtype is unchanged.
+            box = cv2.boxPoints(cv2.minAreaRect(raw_line.pts)).astype(np.intp)
+            # Rotate the starting corner by two positions; this changes which
+            # vertex comes first, not the winding direction.
+            box = np.roll(box, 2, axis=0)
+
+            cv2.fillPoly(mask, [box], color=255)
+            textlines.append(Quadrilateral(box, '', 1))
+
+        return textlines, mask, None
diff --git a/requirements.txt b/requirements.txt
@@ -54,3 +54,5 @@ uvicorn
 fastapi
 pydantic
 python-multipart
+paddleocr
+paddlepaddle-gpu==2.5.2
diff --git a/server/index.html b/server/index.html
@@ -49,6 +49,7 @@ <h1 class="text-center text-lg font-light">Image/Manga Translator</h1>
                                 v-model="textDetector">
                             <option value="default">Default</option>
                             <option value="ctd">CTD</option>
+                            <option value="paddle">Paddle</option>
                         </select>
                         <i class="iconify absolute top-1.5 right-1 pointer-events-none"
                            data-icon="carbon:chevron-down"></i>