From 47a4ae7e05782de7f79e9db3673849a897ee5f40 Mon Sep 17 00:00:00 2001 From: Hassan Sarwat Date: Sat, 27 Jan 2024 20:25:11 +0100 Subject: [PATCH 1/9] updated code to later versions of torch and tf --- .gitignore | 5 +++++ bin_mean_shift.py | 2 +- data_tools/RecordReaderAll.py | 3 ++- data_tools/convert_tfrecords.py | 16 +++++++++------- utils/loss.py | 11 ++++++----- utils/metric.py | 4 ++-- 6 files changed, 25 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 076b72e..bc480cd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,11 @@ #ide .idea/ +# Data +output_dir/ +datasets/ + + # checkpoints experiments/ diff --git a/bin_mean_shift.py b/bin_mean_shift.py index 913b7f5..b7fb278 100644 --- a/bin_mean_shift.py +++ b/bin_mean_shift.py @@ -117,7 +117,7 @@ def merge_center(self, seed_point, bandwidth=0.25): # merge center if distance between two points less than bandwidth sorted_intensity, indices = torch.sort(intensity, descending=True) - is_center = np.ones(n, dtype=np.bool) + is_center = np.ones(n, dtype=bool) indices = indices.cpu().numpy() center = np.zeros(n, dtype=np.uint8) diff --git a/data_tools/RecordReaderAll.py b/data_tools/RecordReaderAll.py index 6125919..3985f6a 100644 --- a/data_tools/RecordReaderAll.py +++ b/data_tools/RecordReaderAll.py @@ -1,5 +1,6 @@ # modified from https://github.com/art-programmer/PlaneNet -import tensorflow as tf +import tensorflow.compat.v1 as tf +tf.disable_v2_behavior() HEIGHT=192 WIDTH=256 diff --git a/data_tools/convert_tfrecords.py b/data_tools/convert_tfrecords.py index 7418561..9b40ac7 100644 --- a/data_tools/convert_tfrecords.py +++ b/data_tools/convert_tfrecords.py @@ -1,11 +1,13 @@ -import tensorflow as tf import numpy as np import os import argparse from RecordReaderAll import * -os.environ['CUDA_VISIBLE_DEVICES']='' +import tensorflow.compat.v1 as tf +tf.disable_v2_behavior() + +#os.environ['CUDA_VISIBLE_DEVICES']='0' parser = argparse.ArgumentParser() parser.add_argument('--input_tfrecords_file', type=str, @@ -27,14 +29,14 @@ os.makedirs(output_dir) if data_type == 'train': - file_list = open(output_dir + '/train.txt', 'w') output_dir = os.path.join(output_dir, 'train') - os.makedirs(output_dir) + os.makedirs(output_dir,exist_ok=True) + file_list = open(output_dir + '/train.txt', 'w') max_num = 50000 elif data_type == 'val': - file_list = open(output_dir + '/val.txt', 'w') output_dir = os.path.join(output_dir, 'val') - os.makedirs(output_dir) + os.makedirs(output_dir, exist_ok=True) + file_list = open(output_dir + '/val.txt', 'w') max_num = 760 else: print("unsupported data type") @@ -74,7 +76,7 @@ file_list.write('%d.npz\n' % (i, )) - if i % 100 == 99: + if i % 1000 == 99: print(i) file_list.close() diff --git a/utils/loss.py b/utils/loss.py index 2b8aba1..788a9ea 100644 --- a/utils/loss.py +++ b/utils/loss.py @@ -45,9 +45,10 @@ def hinge_embedding_loss(embedding, num_planes, segmentation, device, t_pull=0.5 embedding = embedding[0] segmentation = segmentation[0] embeddings = [] + # print(segmentation[0, :, :].view(1, h, w)) # select embedding with segmentation for i in range(num_planes): - feature = torch.transpose(torch.masked_select(embedding, segmentation[i, :, :].view(1, h, w)).view(c, -1), 0, 1) + feature = torch.transpose(torch.masked_select(embedding, segmentation[i, :, :].view(1, h, w).bool()).view(c, -1), 0, 1) embeddings.append(feature) centers = [] @@ -90,8 +91,8 @@ def surface_normal_loss(prediction, surface_normal, valid_region): valid_predition = torch.transpose(prediction.view(c, -1), 0, 1) 
valid_surface_normal = torch.transpose(surface_normal.view(c, -1), 0, 1) else: - valid_predition = torch.transpose(torch.masked_select(prediction, valid_region).view(c, -1), 0, 1) - valid_surface_normal = torch.transpose(torch.masked_select(surface_normal, valid_region).view(c, -1), 0, 1) + valid_predition = torch.transpose(torch.masked_select(prediction, valid_region.bool()).view(c, -1), 0, 1) + valid_surface_normal = torch.transpose(torch.masked_select(surface_normal, valid_region.bool()).view(c, -1), 0, 1) similarity = torch.nn.functional.cosine_similarity(valid_predition, valid_surface_normal, dim=1) @@ -107,8 +108,8 @@ def parameter_loss(prediction, param, valid_region): valid_predition = torch.transpose(prediction.view(c, -1), 0, 1) valid_param = torch.transpose(param.view(c, -1), 0, 1) else: - valid_predition = torch.transpose(torch.masked_select(prediction, valid_region).view(c, -1), 0, 1) - valid_param = torch.transpose(torch.masked_select(param, valid_region).view(c, -1), 0, 1) + valid_predition = torch.transpose(torch.masked_select(prediction, valid_region.bool()).view(c, -1), 0, 1) + valid_param = torch.transpose(torch.masked_select(param, valid_region.bool()).view(c, -1), 0, 1) return torch.mean(torch.sum(torch.abs(valid_predition - valid_param), dim=1)) diff --git a/utils/metric.py b/utils/metric.py index 9bde5dc..a1661ac 100644 --- a/utils/metric.py +++ b/utils/metric.py @@ -14,8 +14,8 @@ def eval_iou(annotation,segmentation): """ - annotation = annotation.astype(np.bool) - segmentation = segmentation.astype(np.bool) + annotation = annotation.astype(bool) + segmentation = segmentation.astype(bool) if np.isclose(np.sum(annotation),0) and np.isclose(np.sum(segmentation),0): return 1 From 926e2b37aa21b23b46031a4927939e9384223d13 Mon Sep 17 00:00:00 2001 From: Hassan Sarwat Date: Thu, 1 Feb 2024 19:36:36 +0100 Subject: [PATCH 2/9] Added dpt --- main.py | 3 ++- poetry.lock | 7 +++++++ pyproject.toml | 15 +++++++++++++++ requirements.txt | 6 +++--- 4 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 poetry.lock create mode 100644 pyproject.toml diff --git a/main.py b/main.py index 893c7d4..30810e6 100644 --- a/main.py +++ b/main.py @@ -187,7 +187,8 @@ def train(_run, _log): random.seed(cfg.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - + print('Device:',device) + print('*'*100) if not (_run._id is None): checkpoint_dir = os.path.join(_run.observers[0].basedir, str(_run._id), 'checkpoints') if not os.path.exists(checkpoint_dir): diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..b6be44c --- /dev/null +++ b/poetry.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
+package = [] + +[metadata] +lock-version = "2.0" +python-versions = "3.8.10" +content-hash = "ff0abf5ff0eeff0cf2570180eaa9c41878150c7126d7d1437ee02139947f66e9" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..95d776d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,15 @@ +[tool.poetry] +name = "ml3d" +version = "0.1.0" +description = "ml3d project" +authors = ["Your Name "] +license = "n" +readme = "README.md" + +[tool.poetry.dependencies] +python = "3.8.10" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements.txt b/requirements.txt index 247c99b..f85d1c2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -torch==0.4.1 -torchvision==0.2.0 -tensorflow==1.14.0 +# torch==0.4.1 +# torchvision==0.2.0 +# tensorflow==1.14.0 imageio scipy argparse From 5f4ef949e36a72536161d83bd85192edb6c8fbf2 Mon Sep 17 00:00:00 2001 From: Hassan Sarwat Date: Thu, 1 Feb 2024 19:38:11 +0100 Subject: [PATCH 3/9] added dpt --- configs/config.yaml | 12 +++++++---- main.py | 43 ++++++++++++++++++++-------------------- models/baseline_same.py | 44 +++++++++++++++++++++++++++++++++-------- 3 files changed, 65 insertions(+), 34 deletions(-) diff --git a/configs/config.yaml b/configs/config.yaml index 470671c..3f19508 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -1,8 +1,9 @@ seed: 123 num_gpus: 1 -num_epochs: 100 -resume_dir: None -print_interval: 10 +num_epochs: 5 +resume_dir: /cluster/52/sarwath/snet/output/models/ +print_interval: 100 + solver: method: adam @@ -10,7 +11,7 @@ solver: weight_decay: 0.00001 dataset: - root_dir: /new_disk2/yuzh/PlaneNetData/ + root_dir: /cluster/52/sarwath/snet/output/processed/ batch_size: 16 num_workers: 8 @@ -19,3 +20,6 @@ model: pretrained: True embed_dims: 2 fix_bn: False + name: baseline_4 + dpt: False + semantic: False diff --git a/main.py b/main.py index 30810e6..cc86a2d 100644 --- a/main.py +++ b/main.py @@ -181,25 +181,27 @@ def load_dataset(subset, cfg): @ex.command def train(_run, _log): cfg = edict(_run.config) - + checkpoint_dir = cfg.resume_dir + model_name = cfg.model.name torch.manual_seed(cfg.seed) np.random.seed(cfg.seed) random.seed(cfg.seed) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print('Device:',device) - print('*'*100) - if not (_run._id is None): - checkpoint_dir = os.path.join(_run.observers[0].basedir, str(_run._id), 'checkpoints') - if not os.path.exists(checkpoint_dir): - os.makedirs(checkpoint_dir) + # print('Device:',device) + # print('*'*100) + # if not (_run._id is None): + # checkpoint_dir = os.path.join(_run.observers[0].basedir, str(_run._id), 'checkpoints') + # if not os.path.exists(checkpoint_dir): + # os.makedirs(checkpoint_dir) + # print(checkpoint_dir) + # print('_-_'*80) # build network network = UNet(cfg.model) - - if not (cfg.resume_dir == 'None'): - model_dict = torch.load(cfg.resume_dir, map_location=lambda storage, loc: storage) - network.load_state_dict(model_dict) + + # if not (cfg.resume_dir == 'None'): + # model_dict = torch.load(cfg.resume_dir, map_location=lambda storage, loc: storage) + # network.load_state_dict(model_dict) # load nets into gpu if cfg.num_gpus > 1 and torch.cuda.is_available(): @@ -356,9 +358,9 @@ def train(_run, _log): history['rmses'].append(rmses.avg) # save checkpoint - if not (_run._id is None): - torch.save(network.state_dict(), os.path.join(checkpoint_dir, f"network_epoch_{epoch}.pt")) - pickle.dump(history, open(os.path.join(checkpoint_dir, 
'history.pkl'), 'wb')) + # if not (_run._id is None): + torch.save(network.state_dict(), os.path.join(checkpoint_dir, f"{model_name}.pt")) + pickle.dump(history, open(os.path.join(checkpoint_dir, 'history.pkl'), 'wb')) @ex.command @@ -371,17 +373,14 @@ def eval(_run, _log): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - if not (_run._id is None): - checkpoint_dir = os.path.join('experiments', str(_run._id), 'checkpoints') - if not os.path.exists(checkpoint_dir): - os.makedirs(checkpoint_dir) + checkpoint_dir = cfg.resume_dir + model_name = cfg.model.name # build network network = UNet(cfg.model) - if not (cfg.resume_dir == 'None'): - model_dict = torch.load(cfg.resume_dir, map_location=lambda storage, loc: storage) - network.load_state_dict(model_dict) + model_dict = torch.load('/cluster/52/sarwath/snet/output/models/baseline_4.pt', map_location=lambda storage, loc: storage) + network.load_state_dict(model_dict) # load nets into gpu if cfg.num_gpus > 1 and torch.cuda.is_available(): diff --git a/models/baseline_same.py b/models/baseline_same.py index 22be26b..2f99508 100644 --- a/models/baseline_same.py +++ b/models/baseline_same.py @@ -2,6 +2,8 @@ import torch.nn as nn from models import resnet_scene as resnet +from transformers import DPTFeatureExtractor as dpt + class ResNet(nn.Module): @@ -38,19 +40,20 @@ def forward(self, x): x4 = self.layer3(x3) x5 = self.layer4(x4) - return x1, x2, x3, x4, x5 - + return x1, x2, x3, x4, x5 class Baseline(nn.Module): def __init__(self, cfg): super(Baseline, self).__init__() - + self.feature_extractor = dpt.from_pretrained("Intel/dpt-large") + self.dpt = cfg.dpt orig_resnet = resnet.__dict__[cfg.arch](pretrained=cfg.pretrained) self.backbone = ResNet(orig_resnet) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.relu = nn.ReLU(inplace=True) - channel = 64 + channel = 3 if cfg.dpt else 64 # top down self.upsample = nn.Upsample(scale_factor=2, mode='bilinear') self.up_conv5 = nn.Conv2d(channel, channel, (1, 1)) @@ -96,11 +99,34 @@ def top_down(self, x): return p0, p1, p2, p3, p4, p5 def forward(self, x): - # bottom up - c1, c2, c3, c4, c5 = self.backbone(x) - # top down - p0, p1, p2, p3, p4, p5 = self.top_down((c1, c2, c3, c4, c5)) + # Garbage, can delete + # print(x.size()) + # print('11'*111) + # print(x[0]) + # feature_extractor = dpt.from_pretrained("Intel/dpt-large") + # test = self.feature_extractor(x,do_resize=False,return_tensors='pt') + # print(test.data.keys()) + # print('x'*100) + # print(test.data['pixel_values'][0]) + # print('00'*111) + # print(test.data['pixel_values'].size()) + # print('00'*111) + + if self.dpt: + p0 = self.feature_extractor(x, do_resize=False, return_tensors='pt').data['pixel_values'].to(self.device) + else: + # bottom up + c1, c2, c3, c4, c5 = self.backbone(x) + # print('_'*100) + # print(c1.size(),'_',c2.size(),'_',c3.size(),'_',c4.size(),'_',c5.size()) + # print('..'*100) + + # top down + p0, p1, p2, p3, p4, p5 = self.top_down((c1, c2, c3, c4, c5)) + # print(p0.size(),'_',p1.size(),'_',p2.size(),'_',p3.size(),'_',p4.size(),'_',p5.size()) + # print('='*100) + # output prob = self.pred_prob(p0) @@ -110,3 +136,5 @@ def forward(self, x): param = self.pred_param(p0) return prob, embedding, depth, surface_normal, param + + From 74b65eb30f72109e8e12e343e46fd9d4e8ab71d9 Mon Sep 17 00:00:00 2001 From: Hassan Sarwat Date: Tue, 6 Feb 2024 22:09:58 +0100 Subject: [PATCH 4/9] Modularized, updated dpt, added semantic loss --- configs/config.yaml | 8 +- main.py | 116 
+++++++++++++++------ models/baseline_same.py | 216 +++++++++++++++++++++++++++++++++++++--- predict.py | 2 +- utils/loss.py | 10 ++ 5 files changed, 304 insertions(+), 48 deletions(-) diff --git a/configs/config.yaml b/configs/config.yaml index 3f19508..c12c8ee 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -2,7 +2,7 @@ seed: 123 num_gpus: 1 num_epochs: 5 resume_dir: /cluster/52/sarwath/snet/output/models/ -print_interval: 100 +print_interval: 10 solver: @@ -16,10 +16,8 @@ dataset: num_workers: 8 model: - arch: resnet101 + arch: resnet101 # dpt # pretrained: True embed_dims: 2 fix_bn: False - name: baseline_4 - dpt: False - semantic: False + semantic: True diff --git a/main.py b/main.py index cc86a2d..6be75d6 100644 --- a/main.py +++ b/main.py @@ -24,9 +24,12 @@ from utils.disp import colors_256 as colors from bin_mean_shift import Bin_Mean_Shift from modules import get_coordinate_map -from utils.loss import Q_loss +from utils.loss import Q_loss, semantic_loss from instance_parameter_loss import InstanceParameterLoss from match_segmentation import MatchSegmentation +from transformers import DPTImageProcessor + +image_processor = DPTImageProcessor().from_pretrained('Intel/dpt-large-ade') ex = Experiment() @@ -114,10 +117,13 @@ def __getitem__(self, index): image = data['image'] image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + resized_image = cv2.resize(src=image, dsize=(256,256)) image = Image.fromarray(image) - + resized_image = Image.fromarray(resized_image) if self.transform is not None: image = self.transform(image) + resized_image = self.transform(resized_image) plane = data['plane'] num_planes = data['num_planes'][0] @@ -143,18 +149,23 @@ def __getitem__(self, index): # since some depth is missing, we use plane to recover those depth following PlaneNet gt_depth = data['depth'].reshape(192, 256) depth = self.plane2depth(plane_parameters, num_planes, gt_segmentation, gt_depth).reshape(1, 192, 256) + gt_semantics = data['semantics'] + gt_semantics = gt_semantics.astype(float) + sample = { + 'resized_image':resized_image, 'image': image, 'num_planes': num_planes, 'instance': torch.ByteTensor(segmentation), # one for planar and zero for non-planar - 'semantic': 1 - torch.FloatTensor(segmentation[num_planes, :, :]).unsqueeze(0), + 'planar': 1 - torch.FloatTensor(segmentation[num_planes, :, :]).unsqueeze(0), 'gt_seg': torch.LongTensor(gt_segmentation), 'depth': torch.FloatTensor(depth), 'plane_parameters': torch.FloatTensor(plane_parameters), 'valid_region': torch.ByteTensor(valid_region.astype(np.uint8)).unsqueeze(0), - 'plane_instance_parameter': torch.FloatTensor(plane_instance_parameter) + 'plane_instance_parameter': torch.FloatTensor(plane_instance_parameter), + 'gt_class': torch.FloatTensor(gt_semantics) } return sample @@ -177,15 +188,15 @@ def load_dataset(subset, cfg): return loaders - @ex.command def train(_run, _log): cfg = edict(_run.config) checkpoint_dir = cfg.resume_dir - model_name = cfg.model.name + # model_name = cfg.model.name torch.manual_seed(cfg.seed) np.random.seed(cfg.seed) random.seed(cfg.seed) + model_path = f"{cfg.resume_dir}/baseline_{cfg.model.arch}_semantic.pt" device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # print('Device:',device) # print('*'*100) @@ -210,7 +221,8 @@ def train(_run, _log): # set up optimizers optimizer = get_optimizer(network.parameters(), cfg.solver) - + if device =='cpu': + cfg.dataset.num_workers=4 # data loader data_loader = load_dataset('train', cfg.dataset) @@ -238,39 +250,61 @@ def train(_run, 
_log): rmses = AverageMeter() instance_rmses = AverageMeter() mean_angles = AverageMeter() + losses_semantic = AverageMeter() + tic = time.time() for iter, sample in enumerate(data_loader): + resized_image = sample['resized_image'].to(device) image = sample['image'].to(device) instance = sample['instance'].to(device) - semantic = sample['semantic'].to(device) + planar = sample['planar'].to(device) gt_depth = sample['depth'].to(device) gt_seg = sample['gt_seg'].to(device) gt_plane_parameters = sample['plane_parameters'].to(device) valid_region = sample['valid_region'].to(device) gt_plane_instance_parameter = sample['plane_instance_parameter'].to(device) + gt_class = sample['gt_class'].to(device) + x = image # forward pass - logit, embedding, _, _, param = network(image) - + if cfg.model.arch=='dpt': + x = image_processor(resized_image, do_resize=False, return_tensors='pt')['pixel_values'].to(device) + + if cfg.model.semantic: + logit, embedding, _, _, param, semantic, combi = network(image) + else: + logit, embedding, _, _, param = network(image) + + # print(semantic) + # print('00'*100) + # print(combi) + # print('1'*100) + tempc = embedding + if cfg.model.semantic: + tempc = combi segmentations, sample_segmentations, sample_params, centers, sample_probs, sample_gt_segs = \ - bin_mean_shift(logit, embedding, param, gt_seg) + bin_mean_shift(logit, tempc, param, gt_seg) # calculate loss - loss, loss_pull, loss_push, loss_binary, loss_depth, loss_normal, loss_parameters, loss_pw, loss_instance \ - = 0., 0., 0., 0., 0., 0., 0., 0., 0. + loss, loss_pull, loss_push, loss_binary, loss_depth, loss_normal, loss_parameters, loss_pw, loss_instance, loss_semantic \ + = 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. batch_size = image.size(0) for i in range(batch_size): + _loss_semantic = 0 _loss, _loss_pull, _loss_push = hinge_embedding_loss(embedding[i:i+1], sample['num_planes'][i:i+1], instance[i:i+1], device) - _loss_binary = class_balanced_cross_entropy_loss(logit[i], semantic[i]) + _loss_binary = class_balanced_cross_entropy_loss(logit[i], planar[i]) _loss_normal, mean_angle = surface_normal_loss(param[i:i+1], gt_plane_parameters[i:i+1], valid_region[i:i+1]) _loss_L1 = parameter_loss(param[i:i + 1], gt_plane_parameters[i:i + 1], valid_region[i:i + 1]) _loss_depth, rmse, infered_depth = Q_loss(param[i:i+1], k_inv_dot_xy1, gt_depth[i:i+1]) + + if cfg.model.semantic: + _loss_semantic = semantic_loss(semantic, gt_class, device) if segmentations[i] is None: continue @@ -279,12 +313,13 @@ def train(_run, _log): instance_parameter_loss(segmentations[i], sample_segmentations[i], sample_params[i], valid_region[i:i+1], gt_depth[i:i+1]) - _loss += _loss_binary + _loss_depth + _loss_normal + _instance_loss + _loss_L1 + _loss += _loss_binary + _loss_depth + _loss_normal + _instance_loss + _loss_L1 + _loss_semantic + # planar segmentation iou prob = torch.sigmoid(logit[i]) mask = (prob > 0.5).float().cpu().numpy() - iou = eval_iou(mask, semantic[i].cpu().numpy()) + iou = eval_iou(mask, planar[i].cpu().numpy()) ioues.update(iou * 100) instance_rmses.update(instance_abs_disntace.item()) rmses.update(rmse.item()) @@ -297,6 +332,7 @@ def train(_run, _log): loss_depth += _loss_depth loss_normal += _loss_normal loss_instance += _instance_loss + loss_semantic += _loss_semantic loss /= batch_size loss_pull /= batch_size @@ -305,6 +341,7 @@ def train(_run, _log): loss_depth /= batch_size loss_normal /= batch_size loss_instance /= batch_size + loss_semantic /= batch_size # Backward optimizer.zero_grad() @@ -319,7 +356,7 @@ def 
train(_run, _log): losses_depth.update(loss_depth.item()) losses_normal.update(loss_normal.item()) losses_instance.update(loss_instance.item()) - + losses_semantic.update(loss_semantic.item()) # update time batch_time.update(time.time() - tic) tic = time.time() @@ -337,7 +374,9 @@ def train(_run, _log): f"AN: {mean_angles.val:.4f} ({mean_angles.avg:.4f}) " f"Depth: {losses_depth.val:.4f} ({losses_depth.avg:.4f}) " f"INSDEPTH: {instance_rmses.val:.4f} ({instance_rmses.avg:.4f}) " - f"RMSE: {rmses.val:.4f} ({rmses.avg:.4f}) ") + f"RMSE: {rmses.val:.4f} ({rmses.avg:.4f}) " + f"Semantic: {losses_semantic.val:.4f}({losses_semantic.avg:.4f}) ") + _log.info(f"* epoch: {epoch:2d}\t" f"Loss: {losses.avg:.6f}\t" @@ -346,7 +385,8 @@ def train(_run, _log): f"Binary: {losses_binary.avg:.6f}\t" f"Depth: {losses_depth.avg:.6f}\t" f"IoU: {ioues.avg:.2f}\t" - f"RMSE: {rmses.avg:.4f}\t") + f"RMSE: {rmses.avg:.4f}\t" + f"Semantic: {losses_semantic.avg:.4f}\t") # save history history['losses'].append(losses.avg) @@ -356,11 +396,13 @@ def train(_run, _log): history['losses_depth'].append(losses_depth.avg) history['ioues'].append(ioues.avg) history['rmses'].append(rmses.avg) + history['losses_semantic'].append(losses_semantic.avg) + # save checkpoint # if not (_run._id is None): - torch.save(network.state_dict(), os.path.join(checkpoint_dir, f"{model_name}.pt")) - pickle.dump(history, open(os.path.join(checkpoint_dir, 'history.pkl'), 'wb')) + torch.save(network.state_dict(), model_path) + pickle.dump(history, open(os.path.join(checkpoint_dir, 'history_semantic.pkl'), 'wb')) @ex.command @@ -374,12 +416,12 @@ def eval(_run, _log): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") checkpoint_dir = cfg.resume_dir - model_name = cfg.model.name + # model_name = cfg.model.name # build network network = UNet(cfg.model) - model_dict = torch.load('/cluster/52/sarwath/snet/output/models/baseline_4.pt', map_location=lambda storage, loc: storage) + model_dict = torch.load('/cluster/52/sarwath/snet/output/models/baseline_dpt.pt', map_location=lambda storage, loc: storage) network.load_state_dict(model_dict) # load nets into gpu @@ -401,27 +443,41 @@ def eval(_run, _log): with torch.no_grad(): for iter, sample in enumerate(data_loader): + resized_image = sample['resized_image'].to(device) image = sample['image'].to(device) instance = sample['instance'].to(device) gt_seg = sample['gt_seg'].numpy() - semantic = sample['semantic'].to(device) + planar = sample['planar'].to(device) gt_depth = sample['depth'].to(device) # gt_plane_parameters = sample['plane_parameters'].to(device) valid_region = sample['valid_region'].to(device) gt_plane_num = sample['num_planes'].int() # gt_plane_instance_parameter = sample['plane_instance_parameter'].numpy() + gt_class = sample['gt_class'].to(device) + + x = image # forward pass - logit, embedding, _, _, param = network(image) + if cfg.model.arch=='dpt': + x = image_processor(resized_image, do_resize=False, return_tensors='pt')['pixel_values'].to(device) + + if cfg.model.semantic: + logit, embedding, _, _, param, semantic, combi = network(image) + else: + logit, embedding, _, _, param = network(image) + tempc = embedding + if cfg.model.semantic: + tempc = combi prob = torch.sigmoid(logit[0]) + # image = cv2.resize(src=image, dsize=(192,256)) # infer per pixel depth using per pixel plane parameter _, _, per_pixel_depth = Q_loss(param, k_inv_dot_xy1, gt_depth) # fast mean shift segmentation, sampled_segmentation, sample_param = bin_mean_shift.test_forward( - prob, embedding[0], 
param, mask_threshold=0.1) + prob, tempc[0], param, mask_threshold=0.1) # since GT plane segmentation is somewhat noise, the boundary of plane in GT is not well aligned, # we thus use avg_pool_2d to smooth the segmentation results @@ -461,7 +517,7 @@ def eval(_run, _log): # visualization and evaluation h, w = 192, 256 image = tensor_to_image(image.cpu()[0]) - semantic = semantic.cpu().numpy().reshape(h, w) + planar = planar.cpu().numpy().reshape(h, w) mask = (prob > 0.1).float().cpu().numpy().reshape(h, w) gt_seg = gt_seg.reshape(h, w) depth = instance_depth.cpu().numpy()[0, 0].reshape(h, w) @@ -501,8 +557,8 @@ def eval(_run, _log): blend_pred = (pred_seg * 0.7 + image * 0.3).astype(np.uint8) blend_gt = (gt_seg_image * 0.7 + image * 0.3).astype(np.uint8) - semantic = cv2.resize((semantic * 255).astype(np.uint8), (w, h)) - semantic = cv2.cvtColor(semantic, cv2.COLOR_GRAY2BGR) + planar = cv2.resize((planar * 255).astype(np.uint8), (w, h)) + planar = cv2.cvtColor(planar, cv2.COLOR_GRAY2BGR) mask = cv2.resize((mask * 255).astype(np.uint8), (w, h)) mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR) @@ -522,7 +578,7 @@ def eval(_run, _log): image_1 = np.concatenate((image, pred_seg, gt_seg_image), axis=1) image_2 = np.concatenate((image, blend_pred, blend_gt), axis=1) - image_3 = np.concatenate((image, mask, semantic), axis=1) + image_3 = np.concatenate((image, mask, planar), axis=1) image_4 = np.concatenate((depth_diff, depth, gt_depth), axis=1) image = np.concatenate((image_1, image_2, image_3, image_4), axis=0) diff --git a/models/baseline_same.py b/models/baseline_same.py index 2f99508..5c9745c 100644 --- a/models/baseline_same.py +++ b/models/baseline_same.py @@ -2,8 +2,11 @@ import torch.nn as nn from models import resnet_scene as resnet -from transformers import DPTFeatureExtractor as dpt - +from transformers import DPTModel, DPTConfig, DPTImageProcessor, DPTForSemanticSegmentation +import collections.abc +import math +from dataclasses import dataclass +from typing import List, Optional, Set, Tuple, Union class ResNet(nn.Module): @@ -41,19 +44,174 @@ def forward(self, x): x5 = self.layer4(x4) return x1, x2, x3, x4, x5 + + + +# def _get_backbone_hidden_size(config): +# if config.backbone_config is not None and config.is_hybrid is False: +# return config.backbone_config.hidden_size +# else: +# return config.hidden_size + + +# class DPTReassembleLayer(nn.Module): +# def __init__(self, config, channels, factor): +# super().__init__() +# # projection +# hidden_size = _get_backbone_hidden_size(config) +# self.projection = nn.Conv2d(in_channels=hidden_size, out_channels=channels, kernel_size=1) + +# # up/down sampling depending on factor +# if factor > 1: +# self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0) +# elif factor == 1: +# self.resize = nn.Identity() +# elif factor < 1: +# # so should downsample +# self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1) + +# def forward(self, hidden_state): +# hidden_state = self.projection(hidden_state) +# hidden_state = self.resize(hidden_state) +# return hidden_state + +# class DPTReassembleStage(nn.Module): +# """ +# This class reassembles the hidden states of the backbone into image-like feature representations at various +# resolutions. + +# This happens in 3 stages: +# 1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to +# `config.readout_type`. +# 2. 
Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`. +# 3. Resizing the spatial dimensions (height, width). + +# Args: +# config (`[DPTConfig]`): +# Model configuration class defining the model architecture. +# """ + +# def __init__(self, config): +# super().__init__() + +# self.config = config +# self.layers = nn.ModuleList() +# if config.is_hybrid: +# self._init_reassemble_dpt_hybrid(config) +# else: +# self._init_reassemble_dpt(config) + +# self.neck_ignore_stages = config.neck_ignore_stages + +# def _init_reassemble_dpt_hybrid(self, config): +# r""" " +# For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official +# implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438 +# for more details. +# """ +# for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors): +# if i <= 1: +# self.layers.append(nn.Identity()) +# elif i > 1: +# self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor)) + +# if config.readout_type != "project": +# raise ValueError(f"Readout type {config.readout_type} is not supported for DPT-Hybrid.") + +# # When using DPT-Hybrid the readout type is set to "project". The sanity check is done on the config file +# self.readout_projects = nn.ModuleList() +# hidden_size = _get_backbone_hidden_size(config) +# for i in range(len(config.neck_hidden_sizes)): +# if i <= 1: +# self.readout_projects.append(nn.Sequential(nn.Identity())) +# elif i > 1: +# self.readout_projects.append( +# nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act]) +# ) + +# def _init_reassemble_dpt(self, config): +# for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors): +# self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor)) + +# if config.readout_type == "project": +# self.readout_projects = nn.ModuleList() +# hidden_size = _get_backbone_hidden_size(config) +# for _ in range(len(config.neck_hidden_sizes)): +# self.readout_projects.append( +# nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act]) +# ) + +# def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]: +# """ +# Args: +# hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`): +# List of hidden states from the backbone. 
+# """ +# out = [] + +# for i, hidden_state in enumerate(hidden_states): +# if i not in self.neck_ignore_stages: +# # reshape to (batch_size, num_channels, height, width) +# cls_token, hidden_state = hidden_state[:, 0], hidden_state[:, 1:] +# batch_size, sequence_length, num_channels = hidden_state.shape +# if patch_height is not None and patch_width is not None: +# hidden_state = hidden_state.reshape(batch_size, patch_height, patch_width, num_channels) +# else: +# size = int(math.sqrt(sequence_length)) +# hidden_state = hidden_state.reshape(batch_size, size, size, num_channels) +# hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + +# feature_shape = hidden_state.shape +# if self.config.readout_type == "project": +# # reshape to (batch_size, height*width, num_channels) +# hidden_state = hidden_state.flatten(2).permute((0, 2, 1)) +# readout = cls_token.unsqueeze(1).expand_as(hidden_state) +# # concatenate the readout token to the hidden states and project +# hidden_state = self.readout_projects[i](torch.cat((hidden_state, readout), -1)) +# # reshape back to (batch_size, num_channels, height, width) +# hidden_state = hidden_state.permute(0, 2, 1).reshape(feature_shape) +# elif self.config.readout_type == "add": +# hidden_state = hidden_state.flatten(2) + cls_token.unsqueeze(-1) +# hidden_state = hidden_state.reshape(feature_shape) +# hidden_state = self.layers[i](hidden_state) +# out.append(hidden_state) + +# return out + + class Baseline(nn.Module): def __init__(self, cfg): super(Baseline, self).__init__() - self.feature_extractor = dpt.from_pretrained("Intel/dpt-large") - self.dpt = cfg.dpt - orig_resnet = resnet.__dict__[cfg.arch](pretrained=cfg.pretrained) - self.backbone = ResNet(orig_resnet) - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.arch = cfg.arch + self.semantic = cfg.semantic + if cfg.arch == 'dpt': + self.arch = 'dpt' + self.dpt_config = DPTConfig(image_size=256) + self.dpt = DPTForSemanticSegmentation(config = self.dpt_config).from_pretrained("Intel/dpt-large-ade") + self.dpt_config = self.dpt.config + self.dpt_proj = nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + self.dpt_conv1 = nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1) + self.dpt_up1 = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True) + self.dpt_conv2 = nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1) + self.dpt_maxp = nn.MaxPool2d(kernel_size=(9,1), dilation=(8,1), stride = (1,1)) + self.dpt_relu = nn.ReLU() + self.dpt_head = nn.Sequential( + nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), + nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1), + nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True), + nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1), + nn.MaxPool2d(kernel_size=(9,1), dilation=(8,1), stride = (1,1)) , + nn.ReLU() + ) + else: + orig_resnet = resnet.__dict__[cfg.arch](pretrained=cfg.pretrained) + self.backbone = ResNet(orig_resnet) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.relu = nn.ReLU(inplace=True) - channel = 3 if cfg.dpt else 64 + channel = 64 # top down self.upsample = nn.Upsample(scale_factor=2, mode='bilinear') self.up_conv5 = nn.Conv2d(channel, channel, (1, 1)) @@ -72,6 +230,9 @@ def __init__(self, cfg): self.p0_conv = nn.Conv2d(channel, channel, (3, 3), padding=1) + + + # plane or non-plane classifier self.pred_prob = nn.Conv2d(channel, 1, (1, 1), padding=0) # embedding @@ -82,6 +243,12 @@ def 
__init__(self, cfg): self.pred_surface_normal = nn.Conv2d(channel, 3, (1, 1), padding=0) # surface plane parameters self.pred_param = nn.Conv2d(channel, 3, (1, 1), padding=0) + + if cfg.semantic: + # semantic segmentation + self.pred_semantic = nn.Conv2d(channel, 41, (1, 1), padding=0) + # combination for semantic pool + self.combination = nn.Conv2d(43, 2, (1, 1), padding=0) def top_down(self, x): c1, c2, c3, c4, c5 = x @@ -97,6 +264,25 @@ def top_down(self, x): p0 = self.relu(self.p0_conv(p0)) return p0, p1, p2, p3, p4, p5 + + def dpt_backbone(self, x): + + l1 = self.dpt.dpt(x, output_hidden_states=True, output_attentions=True, return_dict=True) + hidden_states = l1.hidden_states + # print(l1.shape) + # print('dpt_enc'*10) + # output_attentions = l1.output_attentions + # return_dict = l1.return_dict + hidden_states = [ feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.dpt.config.backbone_out_indices] + + l2 = self.dpt.neck(hidden_states=hidden_states) + l3 = l2[-1] + # print(l3.shape) + # print('dpt_dec_'*20) + l4 = self.dpt_head(l3) + # print(l4.shape) + # print('dpt_head_'*20) + return l1,l2,l3,l4 def forward(self, x): @@ -112,9 +298,9 @@ def forward(self, x): # print('00'*111) # print(test.data['pixel_values'].size()) # print('00'*111) - - if self.dpt: - p0 = self.feature_extractor(x, do_resize=False, return_tensors='pt').data['pixel_values'].to(self.device) + + if self.arch == 'dpt': + p3,p2,p1,p0 = self.dpt_backbone(x) else: # bottom up c1, c2, c3, c4, c5 = self.backbone(x) @@ -129,12 +315,18 @@ def forward(self, x): # output + prob = self.pred_prob(p0) embedding = self.embedding_conv(p0) depth = self.pred_depth(p0) surface_normal = self.pred_surface_normal(p0) param = self.pred_param(p0) - + + if self.semantic: + semantic = self.pred_semantic(p0) + combination = self.combination(torch.cat((embedding, semantic), dim=1)) + return prob, embedding, depth, surface_normal, param, semantic, combination + return prob, embedding, depth, surface_normal, param diff --git a/predict.py b/predict.py index 2cd8baa..9053d92 100644 --- a/predict.py +++ b/predict.py @@ -66,7 +66,7 @@ def predict(_run, _log): image = transforms(image) image = image.to(device).unsqueeze(0) # forward pass - logit, embedding, _, _, param = network(image) + logit, embedding, _, _, param, semantic, combi = network(image) prob = torch.sigmoid(logit[0]) diff --git a/utils/loss.py b/utils/loss.py index 788a9ea..4d7f91e 100644 --- a/utils/loss.py +++ b/utils/loss.py @@ -151,3 +151,13 @@ def Q_loss(param, k_inv_dot_xy1, gt_depth): q_diff = torch.abs(torch.sum(valid_param * Q, dim=0, keepdim=True) - 1.) 
loss = torch.mean(q_diff) return loss, abs_distance, infered_depth.view(1, 1, h, w) + +def semantic_loss(semantic, gt_class,device): + b, c, h, w = semantic.size() + + semantic = torch.transpose(semantic.view(c, -1).to(device), 0, 1) + gt_class = gt_class.long().view(-1).to(device) + loss_func = torch.nn.CrossEntropyLoss().to(device) + loss = loss_func(semantic, gt_class) + return loss + From c1c884f1442eda382884383e92f5381d885cb123 Mon Sep 17 00:00:00 2001 From: Dmitry Fadeev Date: Wed, 7 Feb 2024 12:40:55 +0100 Subject: [PATCH 5/9] changes to contrastive loss --- configs/config.yaml | 4 +-- main.py | 12 ++++--- utils/loss.py | 85 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 6 deletions(-) diff --git a/configs/config.yaml b/configs/config.yaml index c12c8ee..f49d3b0 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -1,7 +1,7 @@ seed: 123 num_gpus: 1 num_epochs: 5 -resume_dir: /cluster/52/sarwath/snet/output/models/ +resume_dir: /Users/dimafadeev/Desktop/Catalog/TUM/WS23/ML3D/repo/output/models/ print_interval: 10 @@ -11,7 +11,7 @@ solver: weight_decay: 0.00001 dataset: - root_dir: /cluster/52/sarwath/snet/output/processed/ + root_dir: /Users/dimafadeev/Desktop/Catalog/TUM/WS23/ML3D/repo/output/processed/ batch_size: 16 num_workers: 8 diff --git a/main.py b/main.py index 6be75d6..df71a9b 100644 --- a/main.py +++ b/main.py @@ -17,7 +17,7 @@ from models.baseline_same import Baseline as UNet from utils.loss import hinge_embedding_loss, surface_normal_loss, parameter_loss, \ - class_balanced_cross_entropy_loss + class_balanced_cross_entropy_loss, contrastive_loss from utils.misc import AverageMeter, get_optimizer from utils.metric import eval_iou, eval_plane_prediction from utils.disp import tensor_to_image @@ -291,9 +291,13 @@ def train(_run, _log): = 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. batch_size = image.size(0) for i in range(batch_size): - _loss_semantic = 0 - _loss, _loss_pull, _loss_push = hinge_embedding_loss(embedding[i:i+1], sample['num_planes'][i:i+1], - instance[i:i+1], device) + _loss_semantic = 0 + # Update with contrastive losee + # _loss, _loss_pull, _loss_push = hinge_embedding_loss(embedding[i:i+1], sample['num_planes'][i:i+1], + # instance[i:i+1], device) + + _loss, _loss_pull, _loss_push = contrastive_loss(embedding[i:i + 1], sample['num_planes'][i:i + 1], + instance[i:i + 1], device) _loss_binary = class_balanced_cross_entropy_loss(logit[i], planar[i]) diff --git a/utils/loss.py b/utils/loss.py index 4d7f91e..ae5d2c8 100644 --- a/utils/loss.py +++ b/utils/loss.py @@ -161,3 +161,88 @@ def semantic_loss(semantic, gt_class,device): loss = loss_func(semantic, gt_class) return loss + +def contrastive_loss(embedding, num_planes, segmentation, device, temperature=0.07, base_temperature=0.07): + """Args: + features: hidden vector of shape [batch_size, num_features]. + labels: ground truth of shape [batch_size]. + Returns: + A loss scalar. + """ + # logits --> for each pixel, the dot product of its embedding and the mean embedding of each plane + # segmentation --> for each pixel, take its num_planes masks (should be one-hot) + + # PROBLEM!! NOT EVERY PIXEL IS FROM A PLANE, THOSE HAVE NO SEGMENTATION AND SHOULD NOT BE ACCOUNTED! 
+ + # print(torch.min(torch.sum(segmentation, dim=0)), torch.max(torch.sum(segmentation, dim=0))) # 1, 1 CHECKED, IT IS 0 + # print(torch.unique(segmentation.sum(dim=0))) # [0,1] + + # print(logits.shape, segmentation.shape) # num_planes x h*w CHECK + + # print(positive.shape) # num_planes x h*w CHECK + + # GOAL: If positive is 0, that pixel is not from a plane, it has to be discarded from everywhere + + # print(indices, indices.shape) + # print(nonzero, len(indices)) # IT'S THE SAME + + b, c, h, w = embedding.size() # b = 1 + + # print(embedding.size()) # 1 x 2 x 192 x 256 CHECK + + # Since it is a single image get rid of first dimension (batch) + num_planes = num_planes.numpy()[0] + # print(num_planes) # 7 for the first + embedding = embedding[0] + segmentation = segmentation[0] + embeddings = [] + + # print(embedding.size()) # 2 x 192 x 256 CHECK + nonzero = 0 + # select embedding with segmentation + for i in range(num_planes): # do not take non-planar region + feature = torch.transpose(torch.masked_select(embedding, segmentation[i, :, :].view(1, h, w)).view(c, -1), 0, 1) + nonzero += feature.shape[0] + # print(feature.shape) # num pixels of plane i x 2 CHECK + embeddings.append(feature) + + centers = [] + for feature in embeddings: + center = torch.mean(feature, dim=0).view(1, c) + centers.append(center) + centers = torch.cat(centers) + + centers = centers.unsqueeze(1) + embedding = embedding.view(-1, c).unsqueeze(0) + logits = embedding * centers + logits = logits.sum(2) # num_planes x h*w + + segmentation = segmentation[:num_planes, :, :].view(-1, h * w) # mask each pixel w.r.t. segmentation + + indices = segmentation.sum(dim=0).nonzero() + + # Only take the dot product of the corresponding center + positive = logits * segmentation.to(torch.float) + + positive = torch.index_select(positive, 1, indices.squeeze()) + logits = torch.index_select(logits, 1, indices.squeeze()) + + # print(positive.shape) # num_planes x planar pixels + # print(logits.shape) + + for i in range(positive.shape[1]): + if len(torch.unique(torch.abs(positive[:, i]))) != 2 & num_planes > 1: + print('FUCK') + print(torch.unique(torch.abs(positive[:, i]))) + if torch.max(torch.abs(positive[:, i])).item() != torch.sum(torch.abs(positive[:, i])): + print('FUCK 2') + print(torch.max(torch.abs(positive[:, i])).item(), torch.sum(positive[:, i]), torch.abs(positive[:, i])) + + exp_logits = torch.exp(logits) + + # positive.sum(0) should only be adding 1 number + log_prob = positive.sum(0) - torch.log(exp_logits.sum(0, keepdim=True)) + + loss = - (temperature / base_temperature) * log_prob + + return torch.mean(loss), torch.mean(loss), torch.tensor(0) From d4fd451a9338aba6249ce1f70318c6f9d18e14e1 Mon Sep 17 00:00:00 2001 From: Dmitry Fadeev Date: Wed, 7 Feb 2024 13:14:29 +0100 Subject: [PATCH 6/9] results contrastive --- utils/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/loss.py b/utils/loss.py index ae5d2c8..afa3f57 100644 --- a/utils/loss.py +++ b/utils/loss.py @@ -201,7 +201,7 @@ def contrastive_loss(embedding, num_planes, segmentation, device, temperature=0. 
nonzero = 0 # select embedding with segmentation for i in range(num_planes): # do not take non-planar region - feature = torch.transpose(torch.masked_select(embedding, segmentation[i, :, :].view(1, h, w)).view(c, -1), 0, 1) + feature = torch.transpose(torch.masked_select(embedding, segmentation[i, :, :].view(1, h, w).bool()).view(c, -1), 0, 1) nonzero += feature.shape[0] # print(feature.shape) # num pixels of plane i x 2 CHECK embeddings.append(feature) From d50dd7f7a25d7e43baa9b046ab3ab82a8d4642f1 Mon Sep 17 00:00:00 2001 From: Dmitry Fadeev Date: Wed, 7 Feb 2024 15:35:22 +0100 Subject: [PATCH 7/9] training going down --- main.py | 3 ++- utils/loss.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index df71a9b..655e4e7 100644 --- a/main.py +++ b/main.py @@ -228,7 +228,8 @@ def train(_run, _log): # save losses per epoch history = {'losses': [], 'losses_pull': [], 'losses_push': [], - 'losses_binary': [], 'losses_depth': [], 'ioues': [], 'rmses': []} + 'losses_binary': [], 'losses_depth': [], 'ioues': [], 'rmses': [], + 'losses_semantic': []} network.train(not cfg.model.fix_bn) diff --git a/utils/loss.py b/utils/loss.py index afa3f57..d6b5437 100644 --- a/utils/loss.py +++ b/utils/loss.py @@ -197,6 +197,10 @@ def contrastive_loss(embedding, num_planes, segmentation, device, temperature=0. segmentation = segmentation[0] embeddings = [] + # Debug print + print(f"Batch size: {b}, Channels: {c}, Height: {h}, Width: {w}") + print(f"Number of planes: {num_planes}") + # print(embedding.size()) # 2 x 192 x 256 CHECK nonzero = 0 # select embedding with segmentation @@ -206,6 +210,9 @@ def contrastive_loss(embedding, num_planes, segmentation, device, temperature=0. # print(feature.shape) # num pixels of plane i x 2 CHECK embeddings.append(feature) + # Debug print + print(f"Non-zero features count: {nonzero}") + centers = [] for feature in embeddings: center = torch.mean(feature, dim=0).view(1, c) @@ -245,4 +252,9 @@ def contrastive_loss(embedding, num_planes, segmentation, device, temperature=0. 
loss = - (temperature / base_temperature) * log_prob + # Debug print + print(f"Logits shape: {logits.shape}, Positive shape: {positive.shape}") + print(f"Sample logits: {logits[:5]}, Sample positive: {positive[:5]}") + print(f"Loss tensor: {loss}") + return torch.mean(loss), torch.mean(loss), torch.tensor(0) From f0ea7cc5a40c9708c726a27ccff35e5bcefdd0a9 Mon Sep 17 00:00:00 2001 From: Dmitry Fadeev Date: Wed, 7 Feb 2024 23:56:03 +0100 Subject: [PATCH 8/9] added new file --- embedding.pt | 3 +++ main.py | 1 + 2 files changed, 4 insertions(+) create mode 100644 embedding.pt diff --git a/embedding.pt b/embedding.pt new file mode 100644 index 0000000..ac35f5c --- /dev/null +++ b/embedding.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4137e5d2d5aacbec248e9c090cf41fa828ec1395dd0b4d5c74d5d849127607e0 +size 1574054 diff --git a/main.py b/main.py index 655e4e7..0764413 100644 --- a/main.py +++ b/main.py @@ -407,6 +407,7 @@ def train(_run, _log): # save checkpoint # if not (_run._id is None): torch.save(network.state_dict(), model_path) + torch.save(embedding, 'embedding.pt') pickle.dump(history, open(os.path.join(checkpoint_dir, 'history_semantic.pkl'), 'wb')) From 8eeca2f015b32cfa639b31824958ee655100bac2 Mon Sep 17 00:00:00 2001 From: Dmitry Fadeev Date: Sat, 10 Feb 2024 10:50:48 +0100 Subject: [PATCH 9/9] trained after 2 days --- embedding.pt | 4 ++-- instance.pt | 3 +++ main.py | 1 + utils/loss.py | 12 ++++++------ utils/subset_npz.py | 31 +++++++++++++++++++++++++++++++ 5 files changed, 43 insertions(+), 8 deletions(-) create mode 100644 instance.pt create mode 100644 utils/subset_npz.py diff --git a/embedding.pt b/embedding.pt index ac35f5c..516c774 100644 --- a/embedding.pt +++ b/embedding.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4137e5d2d5aacbec248e9c090cf41fa828ec1395dd0b4d5c74d5d849127607e0 -size 1574054 +oid sha256:a8f7ce8431ff6498653ff87fb86ecb7d14bfa424f29b32e04132ea66ffd9b7b9 +size 6292646 diff --git a/instance.pt b/instance.pt new file mode 100644 index 0000000..acc5d85 --- /dev/null +++ b/instance.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29cedc5d821c9d5793bba67f1332436ff9e72e03a1e3ab08a785b6ef78e9e51c +size 16516257 diff --git a/main.py b/main.py index 0764413..51907c3 100644 --- a/main.py +++ b/main.py @@ -408,6 +408,7 @@ def train(_run, _log): # if not (_run._id is None): torch.save(network.state_dict(), model_path) torch.save(embedding, 'embedding.pt') + torch.save(instance, 'instance.pt') pickle.dump(history, open(os.path.join(checkpoint_dir, 'history_semantic.pkl'), 'wb')) diff --git a/utils/loss.py b/utils/loss.py index d6b5437..0689ca1 100644 --- a/utils/loss.py +++ b/utils/loss.py @@ -198,8 +198,8 @@ def contrastive_loss(embedding, num_planes, segmentation, device, temperature=0. embeddings = [] # Debug print - print(f"Batch size: {b}, Channels: {c}, Height: {h}, Width: {w}") - print(f"Number of planes: {num_planes}") + # print(f"Batch size: {b}, Channels: {c}, Height: {h}, Width: {w}") + # print(f"Number of planes: {num_planes}") # print(embedding.size()) # 2 x 192 x 256 CHECK nonzero = 0 @@ -210,8 +210,8 @@ def contrastive_loss(embedding, num_planes, segmentation, device, temperature=0. 
# print(feature.shape) # num pixels of plane i x 2 CHECK embeddings.append(feature) - # Debug print - print(f"Non-zero features count: {nonzero}") + # # Debug print + # print(f"Non-zero features count: {nonzero}") centers = [] for feature in embeddings: @@ -253,8 +253,8 @@ def contrastive_loss(embedding, num_planes, segmentation, device, temperature=0. loss = - (temperature / base_temperature) * log_prob # Debug print - print(f"Logits shape: {logits.shape}, Positive shape: {positive.shape}") - print(f"Sample logits: {logits[:5]}, Sample positive: {positive[:5]}") + # print(f"Logits shape: {logits.shape}, Positive shape: {positive.shape}") + # print(f"Sample logits: {logits[:5]}, Sample positive: {positive[:5]}") print(f"Loss tensor: {loss}") return torch.mean(loss), torch.mean(loss), torch.tensor(0) diff --git a/utils/subset_npz.py b/utils/subset_npz.py new file mode 100644 index 0000000..7be0ddd --- /dev/null +++ b/utils/subset_npz.py @@ -0,0 +1,31 @@ +import shutil +import os + +def main(): + # Path to the directory containing all .npz files + npz_directory = '/Users/dimafadeev/Desktop/Catalog/TUM/WS23/ML3D/repo/processed_data/train' + + # Path to the .txt file containing the list of files to subset + txt_file_path = '/Users/dimafadeev/Desktop/Catalog/TUM/WS23/ML3D/repo/processed_data/train.txt' + + # Path to the directory where you want to save the subset + subset_directory = '/Users/dimafadeev/Desktop/Catalog/TUM/WS23/ML3D/repo/processed_data/train_subset' + + # Make sure the subset directory exists + os.makedirs(subset_directory, exist_ok=True) + + # Read the list of .npz file names from the .txt file + with open(txt_file_path, 'r') as file: + subset_files = [line.strip() for line in file] + + # Copy the subset .npz files + for file_name in subset_files: + full_file_path = os.path.join(npz_directory, file_name) + if os.path.isfile(full_file_path): + # Copy the file to the subset directory + shutil.copy(full_file_path, subset_directory) + else: + print(f"File {file_name} not found in the npz directory.") + +if __name__ == "__main__": + main()
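For reference, a minimal standalone sketch (not part of the patch series above) of the per-pixel contrastive loss that patches 5-9 introduce in utils/loss.py. Assumptions: one image at a time (batching handled by the caller), non-overlapping binary plane masks, and temperature == base_temperature so the leading scale factor is 1; the function name and the toy shapes in the demo are illustrative, not taken from the repo.

# Simplified restatement of contrastive_loss() from utils/loss.py:
# each planar pixel is pulled toward its own plane's mean embedding and
# pushed away from the other planes' means via a softmax over plane centers.
import torch


def contrastive_loss_sketch(embedding, segmentation):
    """embedding: (c, h, w) per-pixel embeddings for one image.
    segmentation: (num_planes, h, w) binary plane masks; non-planar pixels are zero in every mask."""
    c, h, w = embedding.shape
    num_planes = segmentation.shape[0]
    flat = embedding.view(c, -1)                        # (c, h*w)
    masks = segmentation.view(num_planes, -1).float()   # (num_planes, h*w)

    # per-plane mean embeddings ("centers")
    centers = (masks @ flat.t()) / masks.sum(dim=1, keepdim=True).clamp(min=1)  # (num_planes, c)

    # dot product of every pixel embedding with every plane center
    logits = centers @ flat                              # (num_planes, h*w)

    # drop non-planar pixels; the positive logit is the pixel's own plane center
    planar = masks.sum(dim=0) > 0                        # (h*w,) bool
    positive = (logits * masks).sum(dim=0)[planar]
    log_denominator = torch.logsumexp(logits[:, planar], dim=0)

    # loss = -mean_p [ z_p . c_{plane(p)} - log sum_k exp(z_p . c_k) ]
    return -(positive - log_denominator).mean()


if __name__ == "__main__":
    emb = torch.randn(2, 192, 256)                       # embed_dims=2, 192x256 crops as in configs/config.yaml
    seg = torch.zeros(3, 192, 256)
    seg[0, :64], seg[1, 64:128], seg[2, 128:] = 1, 1, 1  # three toy planes tiling the image
    print(contrastive_loss_sketch(emb, seg))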