nnstreamer · jijoongmoon · May 23, 2024 · May 17, 2024 · May 22, 2024
@@ -22,6 +22,7 @@
 class YOLODataset(Dataset):
     def __init__(self, img_dir, ann_dir):
         super().__init__()
+        self.img_dir = img_dir
         pattern = re.compile("\/(\d+)\.")
         img_list = glob.glob(img_dir + "*")
         ann_list = glob.glob(ann_dir + "*")
@@ -30,12 +31,11 @@ def __init__(self, img_dir, ann_dir):
         ann_ids = list(map(lambda x: pattern.search(x).group(1), ann_list))
         ids_list = list(set(img_ids) & set(ann_ids))
 
-        self.input_images = []
+        self.ids_list = []
         self.bbox_gt = []
         self.cls_gt = []
 
         for ids in ids_list:
-            img = np.array(Image.open(img_dir + ids + ".jpg").resize((416, 416))) / 255
             label_bbox = []
             label_cls = []
             with open(ann_dir + ids + ".txt", "rt", encoding="utf-8") as f:
@@ -47,19 +47,27 @@ def __init__(self, img_dir, ann_dir):
             if len(label_cls) == 0:
                 continue
 
-            self.input_images.append(img)
+            self.ids_list.append(ids)
             self.bbox_gt.append(label_bbox)
             self.cls_gt.append(label_cls)
 
-        self.length = len(self.input_images)
-        self.input_images = np.array(self.input_images)
-        self.input_images = torch.FloatTensor(self.input_images).permute((0, 3, 1, 2))
+        self.length = len(self.ids_list)
 
     def __len__(self):
         """
         @brief Number of samples in the dataset.
         @return self.length, which __init__ sets to len(self.ids_list)
         """
         return self.length
 
     def __getitem__(self, idx):
-        return self.input_images[idx], self.bbox_gt[idx], self.cls_gt[idx]
+        """
+        @brief Load one sample; the image is read from disk on every call.
+        @param idx index into self.ids_list / self.bbox_gt / self.cls_gt
+        @return tuple (img, bbox_gt[idx], cls_gt[idx]) where img is a
+                torch.FloatTensor built from the resized image divided by 255
+        """
+        # The path is built by plain string concatenation, so self.img_dir
+        # must already end with a path separator.
+        # The image is resized to 416x416, converted to a float tensor, and
+        # its last axis is moved to the front by permute((2, 0, 1)).
+        # NOTE(review): presumably HWC -> CHW; this assumes the JPEG decodes
+        # to a 3-D array (a grayscale file would give 2-D) -- confirm.
+        img = (
+            torch.FloatTensor(
+                np.array(
+                    Image.open(self.img_dir + self.ids_list[idx] + ".jpg").resize(
+                        (416, 416)
+                    )
+                )
+            ).permute((2, 0, 1))
+            / 255
+        )
+        return img, self.bbox_gt[idx], self.cls_gt[idx]
 
 
 ##

@@ -10,14 +10,17 @@
 import sys
 import os
 
-from torchconverter import save_bin
-import torch
+from PIL import Image, ImageDraw
+from matplotlib import pyplot as plt
 from torch import optim
 from torch.utils.data import DataLoader
+import torch
+import numpy as np
 
 from yolo import YoloV2
 from yolo_loss import YoloV2_LOSS
 from dataset import YOLODataset, collate_db
+from torchconverter import save_bin
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -137,10 +140,9 @@ def get_util_path():
           valid loss: {epoch_valid_loss / len(valid_loader):.4f}"
     )
 
+
 ##
 # @brief bbox post process function for inference
-
-
 def post_process_for_bbox(bbox_p):
     """
     @param bbox_p shape(batch_size, cell_h x cell_w, num_anchors, 4)
@@ -175,8 +177,32 @@ def post_process_for_bbox(bbox_p):
     return bbox_p
 
 
+def visualize_bbox(img_pred, bbox_preds):
+    """
+    @brief Draw predicted boxes on an image and display it with matplotlib.
+    @param img_pred image tensor; it is moved to the CPU, multiplied by 255,
+           permuted with (1, 2, 0) and cast to uint8 before drawing
+    @param bbox_preds iterable of 4-element boxes; each value is multiplied
+           by 416 and truncated to int, then read as
+           (x_lefttop, y_lefttop, width, height)
+    """
+    # NOTE(review): permute((1, 2, 0)) requires a 3-D tensor, presumably
+    # (C, H, W) with values in 0~1 -- confirm the caller does not pass a
+    # batched 4-D tensor.
+    img_array = (img_pred.to("cpu") * 255).permute((1, 2, 0)).numpy().astype(np.uint8)
+    img = Image.fromarray(img_array)
+
+    for bbox_pred in bbox_preds:
+        # Scale the box values by the hard-coded image size 416.
+        bbox_pred = [int(x * 416) for x in bbox_pred]
+
+        # Skip boxes whose scaled values sum to zero (e.g. all-zero boxes).
+        if sum(bbox_pred) == 0:
+            continue
+
+        x_lefttop = bbox_pred[0]
+        y_lefttop = bbox_pred[1]
+        width = bbox_pred[2]
+        height = bbox_pred[3]
+
+        # NOTE(review): a new ImageDraw.Draw object is created for every box;
+        # it does not depend on the box and could be created once.
+        draw = ImageDraw.Draw(img)
+        # The rectangle is given as its top-left and bottom-right corners.
+        draw.rectangle(
+            [(x_lefttop, y_lefttop), (x_lefttop + width, y_lefttop + height)]
+        )
+
+    plt.imshow(img)
+    plt.show()
+
+
 # inference example using trained model
-hypothesis = model(img).permute((0, 2, 3, 1))
+hypothesis = model(img.to(device)).permute((0, 2, 3, 1))
 hypothesis = hypothesis[0].reshape((1, out_size**2, num_anchors, 5 + num_classes))
 
 # transform output
@@ -192,4 +218,5 @@ def post_process_for_bbox(bbox_p):
 
 # result of inference (data range 0~1)
 iou_mask = iou_pred > 0.5
-print(bbox_pred * iou_mask, iou_pred * iou_mask, prob_pred * iou_mask)
+bbox_pred = bbox_pred * iou_mask
+visualize_bbox(img, bbox_pred.reshape(-1, 4))
@@ -20,98 +20,122 @@ def __init__(self, num_classes, num_anchors=5):
         self.num_classes = num_classes
         self.num_anchors = num_anchors
         self.conv1 = nn.Sequential(
-            nn.Conv2d(3, 32, 3, 1, 1),
-            nn.BatchNorm2d(32, eps=1e-3),
-            nn.LeakyReLU(),
+            nn.Conv2d(3, 32, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(32),
+            nn.LeakyReLU(0.1),
             nn.MaxPool2d(2, 2),
         )
         self.conv2 = nn.Sequential(
-            nn.Conv2d(32, 64, 3, 1, 1),
-            nn.BatchNorm2d(64, eps=1e-3),
-            nn.LeakyReLU(),
+            nn.Conv2d(32, 64, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(64),
+            nn.LeakyReLU(0.1),
             nn.MaxPool2d(2, 2),
         )
         self.conv3 = nn.Sequential(
-            nn.Conv2d(64, 128, 3, 1, 1), nn.BatchNorm2d(128, eps=1e-3), nn.LeakyReLU()
+            nn.Conv2d(64, 128, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(128),
+            nn.LeakyReLU(0.1),
         )
         self.conv4 = nn.Sequential(
-            nn.Conv2d(128, 64, 1, 1, 0), nn.BatchNorm2d(64, eps=1e-3), nn.LeakyReLU()
+            nn.Conv2d(128, 64, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(64),
+            nn.LeakyReLU(0.1),
         )
         self.conv5 = nn.Sequential(
-            nn.Conv2d(64, 128, 3, 1, 1),
-            nn.BatchNorm2d(128, eps=1e-3),
-            nn.LeakyReLU(),
+            nn.Conv2d(64, 128, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(128),
+            nn.LeakyReLU(0.1),
             nn.MaxPool2d(2, 2),
         )
         self.conv6 = nn.Sequential(
-            nn.Conv2d(128, 256, 3, 1, 1), nn.BatchNorm2d(256, eps=1e-3), nn.LeakyReLU()
+            nn.Conv2d(128, 256, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(256),
+            nn.LeakyReLU(0.1),
         )
         self.conv7 = nn.Sequential(
-            nn.Conv2d(256, 128, 1, 1, 0), nn.BatchNorm2d(128, eps=1e-3), nn.LeakyReLU()
+            nn.Conv2d(256, 128, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(128),
+            nn.LeakyReLU(0.1),
         )
         self.conv8 = nn.Sequential(
-            nn.Conv2d(128, 256, 3, 1, 1),
-            nn.BatchNorm2d(256, eps=1e-3),
-            nn.LeakyReLU(),
+            nn.Conv2d(128, 256, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(256),
+            nn.LeakyReLU(0.1),
             nn.MaxPool2d(2, 2),
         )
         self.conv9 = nn.Sequential(
-            nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512, eps=1e-3), nn.LeakyReLU()
+            nn.Conv2d(256, 512, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(512),
+            nn.LeakyReLU(0.1),
         )
         self.conv10 = nn.Sequential(
-            nn.Conv2d(512, 256, 1, 1, 0), nn.BatchNorm2d(256, eps=1e-3), nn.LeakyReLU()
+            nn.Conv2d(512, 256, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(256),
+            nn.LeakyReLU(0.1),
         )
         self.conv11 = nn.Sequential(
-            nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512, eps=1e-3), nn.LeakyReLU()
+            nn.Conv2d(256, 512, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(512),
+            nn.LeakyReLU(0.1),
         )
         self.conv12 = nn.Sequential(
-            nn.Conv2d(512, 256, 1, 1, 0), nn.BatchNorm2d(256, eps=1e-3), nn.LeakyReLU()
+            nn.Conv2d(512, 256, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(256),
+            nn.LeakyReLU(0.1),
         )
         self.conv13 = nn.Sequential(
-            nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512, eps=1e-3), nn.LeakyReLU()
+            nn.Conv2d(256, 512, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(512),
+            nn.LeakyReLU(0.1),
         )
 
         self.conv_b = nn.Sequential(
-            nn.Conv2d(512, 64, 1, 1, 0), nn.BatchNorm2d(64, eps=1e-3), nn.LeakyReLU()
+            nn.Conv2d(512, 64, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(64),
+            nn.LeakyReLU(0.1),
         )
 
         self.maxpool_a = nn.MaxPool2d(2, 2)
         self.conv_a1 = nn.Sequential(
-            nn.Conv2d(512, 1024, 3, 1, 1),
-            nn.BatchNorm2d(1024, eps=1e-3),
-            nn.LeakyReLU(),
+            nn.Conv2d(512, 1024, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(1024),
+            nn.LeakyReLU(0.1),
         )
         self.conv_a2 = nn.Sequential(
-            nn.Conv2d(1024, 512, 1, 1, 0), nn.BatchNorm2d(512, eps=1e-3), nn.LeakyReLU()
+            nn.Conv2d(1024, 512, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(512),
+            nn.LeakyReLU(0.1),
         )
         self.conv_a3 = nn.Sequential(
-            nn.Conv2d(512, 1024, 3, 1, 1),
-            nn.BatchNorm2d(1024, eps=1e-3),
-            nn.LeakyReLU(),
+            nn.Conv2d(512, 1024, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(1024),
+            nn.LeakyReLU(0.1),
         )
         self.conv_a4 = nn.Sequential(
-            nn.Conv2d(1024, 512, 1, 1, 0), nn.BatchNorm2d(512, eps=1e-3), nn.LeakyReLU()
+            nn.Conv2d(1024, 512, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(512),
+            nn.LeakyReLU(0.1),
         )
         self.conv_a5 = nn.Sequential(
-            nn.Conv2d(512, 1024, 3, 1, 1),
-            nn.BatchNorm2d(1024, eps=1e-3),
-            nn.LeakyReLU(),
+            nn.Conv2d(512, 1024, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(1024),
+            nn.LeakyReLU(0.1),
         )
         self.conv_a6 = nn.Sequential(
-            nn.Conv2d(1024, 1024, 3, 1, 1),
-            nn.BatchNorm2d(1024, eps=1e-3),
-            nn.LeakyReLU(),
+            nn.Conv2d(1024, 1024, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(1024),
+            nn.LeakyReLU(0.1),
         )
         self.conv_a7 = nn.Sequential(
-            nn.Conv2d(1024, 1024, 3, 1, 1),
-            nn.BatchNorm2d(1024, eps=1e-3),
-            nn.LeakyReLU(),
+            nn.Conv2d(1024, 1024, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(1024),
+            nn.LeakyReLU(0.1),
         )
 
         self.conv_out1 = nn.Sequential(
-            nn.Conv2d(1280, 1024, 3, 1, 1),
-            nn.BatchNorm2d(1024, eps=1e-3),
-            nn.LeakyReLU(),
+            nn.Conv2d(1280, 1024, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(1024),
+            nn.LeakyReLU(0.1),
         )
 
         self.conv_out2 = nn.Conv2d(1024, self.num_anchors * (5 + num_classes), 1, 1, 0)

@@ -139,6 +139,7 @@ std::vector<LayerHandle> yoloBlock(const std::string &block_name,
       withKey("filters", filters),
       withKey("kernel_size", {kernel_size, kernel_size}),
       withKey("padding", padding),
+      withKey("disable_bias", "true"),
       withKey("input_layers", input_layer)};
 
     return createLayer("conv2d", props);
@@ -150,6 +151,7 @@ std::vector<LayerHandle> yoloBlock(const std::string &block_name,
   if (downsample) {
     LayerHandle a2 = createLayer("batch_normalization",
                                  {with_name("a2"), withKey("momentum", "0.9"),
+                                  withKey("epsilon", 0.00001),
                                   withKey("activation", "leaky_relu")});
 
     LayerHandle a3 = createLayer(
@@ -158,10 +160,10 @@ std::vector<LayerHandle> yoloBlock(const std::string &block_name,
 
     return {a1, a2, a3};
   } else {
-    LayerHandle a2 =
-      createLayer("batch_normalization",
-                  {withKey("name", block_name), withKey("momentum", "0.9"),
-                   withKey("activation", "leaky_relu")});
+    LayerHandle a2 = createLayer(
+      "batch_normalization",
+      {withKey("name", block_name), withKey("momentum", "0.9"),
+       withKey("epsilon", 0.00001), withKey("activation", "leaky_relu")});
 
     return {a1, a2};
   }