Merge branch 'main' of github.com:zyddnys/manga-image-translator

zyddnys · Dec 21, 2023 · b762cb2 · b762cb2
2 parents a7af9bb + a33ee06
commit b762cb2
Show file tree

Hide file tree

Showing 24 changed files with 119 additions and 103 deletions.
diff --git a/.gitignore b/.gitignore
@@ -38,3 +38,4 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+.history
diff --git a/Makefile b/Makefile
@@ -8,6 +8,6 @@ run-web-server:
 		--manga2eng \
 		--verbose \
 		--mode=web \
-		--use-cuda \ 
+		--use-gpu \ 
 		--host=0.0.0.0 \
 		--port=5003
diff --git a/README.md b/README.md
@@ -54,7 +54,7 @@ Python 3.10.6
 # Clone this repo
 $ git clone https://github.com/zyddnys/manga-image-translator.git
 
-# For --use-cuda option go to https://pytorch.org/ and follow
+# For --use-gpu option go to https://pytorch.org/ and follow
 # pytorch installation instructions. Add `--upgrade --force-reinstall`
 # to the pip command to overwrite the currently installed pytorch version.
 
@@ -92,7 +92,7 @@ Also, if you have trouble installing pydensecrf with the command above you can i
 #### Batch mode (default)
 
 ```bash
-# use `--use-cuda` for speedup if you have a compatible NVIDIA GPU.
+# use `--use-gpu` for speedup if you have a compatible NVIDIA GPU or are using Apple Silicon.
 # use `--target-lang <language_code>` to specify a target language.
 # use `--inpainter=none` to disable inpainting.
 # use `--translator=none` if you only want to use inpainting (blank bubbles)
@@ -115,7 +115,7 @@ $ python -m manga_translator --mode demo -v --translator=google -l ENG -i <path>
 
 ```bash
 # use `--mode web` to start a web server.
-$ python -m manga_translator -v --mode web --use-cuda
+$ python -m manga_translator -v --mode web --use-gpu
 # the demo will be serving on http://127.0.0.1:5003
 ```
 
@@ -129,7 +129,7 @@ Basic manual translation demo can be found at <http://127.0.0.1:5003/manual> whe
 
 ```bash
 # use `--mode api` to start a web server.
-$ python -m manga_translator -v --mode api --use-cuda
+$ python -m manga_translator -v --mode api --use-gpu
 # the api will be serving on http://127.0.0.1:5003
 ```
 The API accepts JSON (POST) and multipart requests.
@@ -393,8 +393,8 @@ THA: Thai
 --overwrite                                  Overwrite already translated images in batch mode.
 --skip-no-text                               Skip image without text (Will not be saved).
 --model-dir MODEL_DIR                        Model directory (by default ./models in project root)
---use-cuda                                   Turn on/off cuda
---use-cuda-limited                           Turn on/off cuda (excluding offline translator)
+--use-gpu                                   Turn on/off gpu (automatic selection between mps or cuda)
+--use-gpu-limited                           Turn on/off gpu (excluding offline translator)
 --detector {default,ctd,craft,none}          Text detector used for creating a text mask from an
                                              image, DO NOT use craft for manga, it's not designed
                                              for it
@@ -556,7 +556,7 @@ docker run --env="DEEPL_AUTH_KEY=xxx" --ipc=host --rm zyddnys/manga-image-transl
 To run the container with the following flags set:
 
 ```bash
-docker run ... --gpus=all ... zyddnys/manga-image-translator:main ... --use-cuda
+docker run ... --gpus=all ... zyddnys/manga-image-translator:main ... --use-gpu
 ```
 
 Or (For the web server + GPU)

diff --git a/README_CN.md b/README_CN.md
@@ -125,8 +125,8 @@ THA: Thai
 --overwrite                                  Overwrite already translated images in batch mode.
 --skip-no-text                               Skip image without text (Will not be saved).
 --model-dir MODEL_DIR                        Model directory (by default ./models in project root)
---use-cuda                                   Turn on/off cuda
---use-cuda-limited                           Turn on/off cuda (excluding offline translator)
+--use-gpu                                   Turn on/off gpu (automatic selection between mps or cuda)
+--use-gpu-limited                           Turn on/off gpu (excluding offline translator)
 --detector {default,ctd,craft,none}          Text detector used for creating a text mask from an
                                              image, DO NOT use craft for manga, it's not designed
                                              for it
@@ -229,13 +229,13 @@ THA: Thai
 ### 使用命令行执行
 
 ```bash
-# 如果机器有支持 CUDA 的 NVIDIA GPU，可以添加 `--use-cuda` 参数
-# 使用 `--use-cuda-limited` 将需要使用大量显存的翻译交由CPU执行，这样可以减少显存占用
+# 如果机器有支持 CUDA 的 NVIDIA GPU，可以添加 `--use-gpu` 参数
+# 使用 `--use-gpu-limited` 将需要使用大量显存的翻译交由CPU执行，这样可以减少显存占用
 # 使用 `--translator=<翻译器名称>` 来指定翻译器
 # 使用 `--target-lang=<语言代码>` 来指定目标语言
 # 将 <图片文件路径> 替换为图片的路径
 # 如果你要翻译的图片比较小或者模糊，可以使用upscaler提升图像大小与质量，从而提升检测翻译效果
-$ python -m manga_translator --verbose --use-cuda --translator=google --target-lang=CHS -i <path_to_image_file>
+$ python -m manga_translator --verbose --use-gpu --translator=google --target-lang=CHS -i <path_to_image_file>
 # 结果会存放到 result 文件夹里
 ```
 
@@ -245,7 +245,7 @@ $ python -m manga_translator --verbose --use-cuda --translator=google --target-l
 # 其它参数如上
 # 使用 `--mode batch` 开启批量翻译模式
 # 将 <图片文件夹路径> 替换为图片文件夹的路径
-$ python -m manga_translator --verbose --mode batch --use-cuda --translator=google --target-lang=CHS -i <图片文件夹路径>
+$ python -m manga_translator --verbose --mode batch --use-gpu --translator=google --target-lang=CHS -i <图片文件夹路径>
 # 结果会存放到 `<图片文件夹路径>-translated` 文件夹里
 ```
 
@@ -254,7 +254,7 @@ $ python -m manga_translator --verbose --mode batch --use-cuda --translator=goog
 ```bash
 # 其它参数如上
 # 使用 `--mode web` 开启 Web 服务器模式
-$ python -m manga_translator --verbose --mode web --use-cuda
+$ python -m manga_translator --verbose --mode web --use-gpu
 # 程序服务会开启在 http://127.0.0.1:5003
 ```
 

diff --git a/demo/doc/docker-compose-local-dev.yml b/demo/doc/docker-compose-local-dev.yml
@@ -4,9 +4,9 @@ services:
     build:
       context: ./../../
     container_name: manga_image_translator_localdev
-    command: --verbose --translator=offline --log-web --mode web --manga2eng --use-inpainting --host=0.0.0.0 --port=5003 # (FOR GPU) --use-cuda
+    command: --verbose --translator=offline --log-web --mode web --manga2eng --use-inpainting --host=0.0.0.0 --port=5003 # (FOR GPU) --use-gpu
     # Batch Mode
-    #command: --mode=batch --verbose --translator=offline --target-lang=ENG --manga2eng --use-inpainting --image "/app/Manga"# (FOR GPU) --use-cuda
+    #command: --mode=batch --verbose --translator=offline --target-lang=ENG --manga2eng --use-inpainting --image "/app/Manga"# (FOR GPU) --use-gpu
     volumes:
       - ./../../result:/app/result
       - ./../../detection:/app/detection

diff --git a/manga_translator/args.py b/manga_translator/args.py
@@ -100,8 +100,8 @@ def _format_action_invocation(self, action: argparse.Action) -> str:
 parser.add_argument('--model-dir', default=None, type=dir_path, help='Model directory (by default ./models in project root)')
 
 g = parser.add_mutually_exclusive_group()
-g.add_argument('--use-cuda', action='store_true', help='Turn on/off cuda')
-g.add_argument('--use-cuda-limited', action='store_true', help='Turn on/off cuda (excluding offline translator)')
+g.add_argument('--use-gpu', action='store_true', help='Turn on/off gpu (auto switch between mps and cuda)')
+g.add_argument('--use-gpu-limited', action='store_true', help='Turn on/off gpu (excluding offline translator)')
 
 parser.add_argument('--detector', default='default', type=str, choices=DETECTORS, help='Text detector used for creating a text mask from an image, DO NOT use craft for manga, it\'s not designed for it')
 parser.add_argument('--ocr', default='48px', type=str, choices=OCRS, help='Optical character recognition (OCR) model to use')

diff --git a/manga_translator/colorization/manga_colorization_v2_utils/denoising/denoiser.py b/manga_translator/colorization/manga_colorization_v2_utils/denoising/denoiser.py
@@ -29,7 +29,6 @@ def __init__(self, _device, _sigma = 25, _weights_dir = 'denoising/models/', _in
         self.weights_dir = _weights_dir
         self.channels = _in_ch
         self.device = _device
-
         self.model = FFDNet(num_input_channels = _in_ch)
         self.load_weights()
         self.model.eval()
@@ -39,11 +38,12 @@ def load_weights(self):
         weights_name = 'net_rgb.pth' if self.channels == 3 else 'net_gray.pth'
         weights_path = os.path.join(self.weights_dir, weights_name)
         if self.device == 'cuda':
+            # DataParallel is only used for CUDA; it is not needed for MPS devices
             state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
-            device_ids = [0]
-            self.model = nn.DataParallel(self.model, device_ids=device_ids).cuda()
+            self.model = nn.DataParallel(self.model,device_ids = [0]).to(self.device)
         else:
-            state_dict = torch.load(weights_path, map_location='cpu')
+            # MPS devices don't support DataParallel
+            state_dict = torch.load(weights_path, map_location=self.device)
             # CPU mode: remove the DataParallel wrapper
             state_dict = remove_dataparallel_wrapper(state_dict)
         self.model.load_state_dict(state_dict)
@@ -90,6 +90,7 @@ def get_denoised_image(self, imorig, sigma = None):
         if self.device == 'cuda':
             dtype = torch.cuda.FloatTensor
         else:
+            # non-CUDA devices (CPU and MPS) use the plain torch.FloatTensor type
             dtype = torch.FloatTensor
 
         imnoisy = imorig#.clone()

diff --git a/manga_translator/colorization/manga_colorization_v2_utils/denoising/functions.py b/manga_translator/colorization/manga_colorization_v2_utils/denoising/functions.py
@@ -39,6 +39,7 @@ def concatenate_input_noise_map(input, noise_sigma):
     if 'cuda' in dtype:
         downsampledfeatures = torch.cuda.FloatTensor(N, Cout, Hout, Wout).fill_(0)
     else:
+        # CPU and MPS are handled the same way: allocate a plain torch.FloatTensor
         downsampledfeatures = torch.FloatTensor(N, Cout, Hout, Wout).fill_(0)
 
     # Build the CxH/2xW/2 noise map

diff --git a/manga_translator/detection/craft.py b/manga_translator/detection/craft.py
@@ -138,9 +138,9 @@ async def _load(self, device: str):
         self.model_refiner.load_state_dict(copyStateDict(torch.load(self._get_file_path('craft_refiner_CTW1500.pth'), map_location='cpu')))
         self.model_refiner.eval()
         self.device = device
-        if device == 'cuda':
-            self.model = self.model.cuda()
-            self.model_refiner = self.model_refiner.cuda()
+        if device == 'cuda' or device == 'mps':
+            self.model = self.model.to(self.device)
+            self.model_refiner = self.model_refiner.to(self.device)
         global MODEL
         MODEL = self.model
 

diff --git a/manga_translator/detection/ctd.py b/manga_translator/detection/ctd.py
@@ -83,9 +83,9 @@ def __init__(self, *args, **kwargs):
 
     async def _load(self, device: str, input_size=1024, half=False, nms_thresh=0.35, conf_thresh=0.4):
         self.device = device
-        if self.device == 'cuda':
+        if self.device == 'cuda' or self.device == 'mps':
             self.model = TextDetBase(self._get_file_path('comictextdetector.pt'), device=self.device, act='leaky')
-            self.model.cuda()
+            self.model.to(self.device)
             self.backend = 'torch'
         else:
             model_path = self._get_file_path('comictextdetector.pt.onnx')

diff --git a/manga_translator/detection/default.py b/manga_translator/detection/default.py
@@ -45,8 +45,8 @@ async def _load(self, device: str):
         self.model.load_state_dict(sd['model'] if 'model' in sd else sd)
         self.model.eval()
         self.device = device
-        if device == 'cuda':
-            self.model = self.model.cuda()
+        if device == 'cuda' or device == 'mps':
+            self.model = self.model.to(self.device)
         global MODEL
         MODEL = self.model
 

diff --git a/manga_translator/inpainting/guided_ldm_inpainting.py b/manga_translator/inpainting/guided_ldm_inpainting.py
@@ -162,21 +162,18 @@ def img2img_inpaint(
         mask: Image.Image,
         ddim_steps = 50, 
         mask_blur: int = 16,
-        use_cuda: bool = True,
+        device: str = 'cpu',
         **kwargs) -> Image.Image :
         ddim_sampler = GuidedDDIMSample(self)
-        if use_cuda :
-            self.cond_stage_model.cuda()
-            self.first_stage_model.cuda()
+        # move the conditioning and first-stage models to the GPU device (cuda or mps)
+        if device.startswith('cuda') or device == 'mps':
+            self.cond_stage_model.to(device)
+            self.first_stage_model.to(device)
         c_text = self.get_learned_conditioning([c_text])
         uc_text = self.get_learned_conditioning([uc_text])
         cond = {"c_crossattn": [c_text]}
         uc_cond = {"c_crossattn": [uc_text]}
 
-        if use_cuda :
-            device = torch.device('cuda:0')
-        else :
-            device = torch.device('cpu')
 
         image_mask = mask
         image_mask = image_mask.convert('L')
@@ -209,23 +206,23 @@ def img2img_inpaint(
         ddim_sampler.make_schedule(ddim_num_steps=steps, ddim_eta=eta, ddim_discretize="uniform", verbose=False)
         x1 = ddim_sampler.stochastic_encode(init_latent, torch.tensor([t_enc] * int(init_latent.shape[0])).to(device), noise=noise)
 
-        if use_cuda :
+        if device.startswith('cuda') or device == 'mps':
             self.cond_stage_model.cpu()
             self.first_stage_model.cpu()
 
-        if use_cuda :
-            self.model.cuda()
+        if device.startswith('cuda') or device == 'mps':
+            self.model.to(device)
         decoded = ddim_sampler.decode(x1, cond,t_enc,init_latent=init_latent,nmask=nmask,unconditional_guidance_scale=7,unconditional_conditioning=uc_cond)
-        if use_cuda :
+        if device.startswith('cuda') or device == 'mps':
             self.model.cpu()
 
         if mask is not None :
             decoded = init_latent * (1 - nmask) + decoded * nmask
 
-        if use_cuda :
-            self.first_stage_model.cuda()
+        if device.startswith('cuda') or device == 'mps':
+            self.first_stage_model.to(device)
         x_samples = self.decode_first_stage(decoded)
-        if use_cuda :
+        if device.startswith('cuda') or device == 'mps':
             self.first_stage_model.cpu()
         return torch.clip(x_samples, -1, 1)
 

diff --git a/manga_translator/inpainting/inpainting_aot.py b/manga_translator/inpainting/inpainting_aot.py
@@ -28,9 +28,9 @@ async def _load(self, device: str):
         sd = torch.load(self._get_file_path('inpainting.ckpt'), map_location='cpu')
         self.model.load_state_dict(sd['model'] if 'model' in sd else sd)
         self.model.eval()
-        self.use_cuda = device == 'cuda'
-        if self.use_cuda:
-            self.model = self.model.cuda()
+        self.device = device
+        if device.startswith('cuda') or device == 'mps':
+            self.model.to(device)
 
 
 def relu_nf(x):

diff --git a/manga_translator/inpainting/inpainting_lama.py b/manga_translator/inpainting/inpainting_lama.py
@@ -28,9 +28,9 @@ async def _load(self, device: str):
         sd = torch.load(self._get_file_path('inpainting_lama.ckpt'), map_location='cpu')
         model.load_state_dict(sd['model'] if 'model' in sd else sd)
         self.model.eval()
-        self.use_cuda = device == 'cuda'
-        if self.use_cuda:
-            self.model = self.model.cuda()
+        self.device = device
+        if device.startswith('cuda') or device == 'mps':
+            self.model.to(device)
 
 
 class DepthWiseSeparableConv(nn.Module):

diff --git a/manga_translator/inpainting/inpainting_lama_mpe.py b/manga_translator/inpainting/inpainting_lama_mpe.py
@@ -45,9 +45,9 @@ def __init__(self, *args, **kwargs):
     async def _load(self, device: str):
         self.model = load_lama_mpe(self._get_file_path('inpainting_lama_mpe.ckpt'), device='cpu')
         self.model.eval()
-        self.use_cuda = device == 'cuda'
-        if self.use_cuda:
-            self.model = self.model.cuda()
+        self.device = device
+        if device.startswith('cuda') or device == 'mps':
+            self.model.to(device)
 
     async def _unload(self):
         del self.model
@@ -84,12 +84,13 @@ async def _infer(self, image: np.ndarray, mask: np.ndarray, inpainting_size: int
         mask_torch = torch.from_numpy(mask).unsqueeze_(0).unsqueeze_(0).float() / 255.0
         mask_torch[mask_torch < 0.5] = 0
         mask_torch[mask_torch >= 0.5] = 1
-        if self.use_cuda:
-            img_torch = img_torch.cuda()
-            mask_torch = mask_torch.cuda()
+        if self.device.startswith('cuda') or self.device == 'mps':
+            img_torch = img_torch.to(self.device)
+            mask_torch = mask_torch.to(self.device)
         with torch.no_grad():
             img_torch *= (1 - mask_torch)
-            if not self.use_cuda:
+            if not (self.device.startswith('cuda')):
+                # non-CUDA devices (CPU and MPS) take this branch
                 img_inpainted_torch = self.model(img_torch, mask_torch)
             else:
                 # Note: lama's weights shouldn't be converted to fp16 or bf16, otherwise it produces darkened results.
@@ -126,9 +127,10 @@ class LamaLargeInpainter(LamaMPEInpainter):
     async def _load(self, device: str):
         self.model = load_lama_mpe(self._get_file_path('lama_large_512px.ckpt'), device='cpu', use_mpe=False, large_arch=True)
         self.model.eval()
-        self.use_cuda = device == 'cuda'
-        if self.use_cuda:
-            self.model = self.model.cuda()
+        self.device = device
+        if device.startswith('cuda') or device == 'mps':
+            self.model.to(device)
+
 
 
 def set_requires_grad(module, value):
@@ -687,6 +689,7 @@ def to(self, device):
             self.discriminator.to(device)
         if self.mpe is not None:
             self.mpe.to(device)
+        return self
 
     def eval(self):
         self.inpaint_only = True

diff --git a/manga_translator/inpainting/inpainting_sd.py b/manga_translator/inpainting/inpainting_sd.py
@@ -72,9 +72,8 @@ async def _load(self, device: str):
         load_ldm_sd(self.model, self._get_file_path('abyssorangemix2_Hard-inpainting.safetensors'))
         hack_everything()
         self.model.eval()
-        self.use_cuda = device == 'cuda'
-        if self.use_cuda:
-            self.model = self.model.cuda()
+        self.device = device
+        self.model = self.model.to(device)
 
     async def _unload(self):
         del self.model
@@ -111,23 +110,22 @@ async def _infer(self, image: np.ndarray, mask: np.ndarray, inpainting_size: int
         pos_prompt = ','.join([x for x in tags.keys() if x not in blacklist]).replace('_', ' ')
         pos_prompt = 'masterpiece,best quality,' + pos_prompt
         neg_prompt = 'worst quality, low quality, normal quality,text,text,text,text'
-
-        if self.use_cuda :
+        if self.device.startswith('cuda') :
             with torch.autocast(enabled = True, device_type = 'cuda') :
                 img = self.model.img2img_inpaint(
                     image = Image.fromarray(image),
                     c_text = pos_prompt,
                     uc_text = neg_prompt,
                     mask = Image.fromarray(mask),
-                    use_cuda = True
+                    device = self.device
                     )
         else :
             img = self.model.img2img_inpaint(
                 image = Image.fromarray(image),
                 c_text = pos_prompt,
                 uc_text = neg_prompt,
                 mask = Image.fromarray(mask),
-                use_cuda = False
+                device = self.device
                 )
 
         img_inpainted = (einops.rearrange(img, '1 c h w -> h w c').cpu().numpy() * 127.5 + 127.5).astype(np.uint8)