Merge pull request #4 from Capstone-Projects-2024-Spring/zhu

Added Imagebind
Capstone-Projects-2024-Spring · Mar 28, 2024 · 345f061 · 345f061
2 parents 3032e7c + 60e14ea
commit 345f061
Show file tree

Hide file tree

Showing 31 changed files with 4,056 additions and 2 deletions.
diff --git a/.assets/bird_audio.wav b/.assets/bird_audio.wav
diff --git a/.assets/bird_image.jpg b/.assets/bird_image.jpg
diff --git a/.assets/car_audio.wav b/.assets/car_audio.wav
diff --git a/.assets/car_image.jpg b/.assets/car_image.jpg
diff --git a/.assets/dog_audio.wav b/.assets/dog_audio.wav
diff --git a/.assets/dog_image.jpg b/.assets/dog_image.jpg
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/project-waveease.iml b/.idea/project-waveease.iml
diff --git a/HelloWorld.py b/HelloWorld.py
@@ -0,0 +1,55 @@
+from imagebind import data
+import torch
+from imagebind.models import imagebind_model
+from imagebind.models.imagebind_model import ModalityType
+# Demo inputs, index-aligned across the three lists (dog, car, bird); the file paths are relative.
+text_list=["A dog.", "A car", "A bird"]
+image_paths=[".assets/dog_image.jpg", ".assets/car_image.jpg", ".assets/bird_image.jpg"]
+audio_paths=[".assets/dog_audio.wav", ".assets/car_audio.wav", ".assets/bird_audio.wav"]
+# Use the first CUDA device when torch reports CUDA as available; otherwise fall back to the CPU.
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# Instantiate model: build imagebind_huge with pretrained=True, put it in eval mode, move it to `device`.
+model = imagebind_model.imagebind_huge(pretrained=True)
+model.eval()
+model.to(device)
+
+# Load data: one entry per modality, each produced by the matching imagebind.data loader for `device`.
+inputs = {
+    ModalityType.TEXT: data.load_and_transform_text(text_list, device),
+    ModalityType.VISION: data.load_and_transform_vision_data(image_paths, device),
+    ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device),
+}
+# Single forward pass with gradient tracking disabled; the script only reads the outputs.
+with torch.no_grad():
+    embeddings = model(inputs)  # indexed by ModalityType in the prints below
+# For each modality pair, print the softmax (over the last dim) of the product of their embeddings.
+print(
+    "Vision x Text: ",
+    torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T, dim=-1),
+)
+print(
+    "Audio x Text: ",
+    torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1),
+)
+print(
+    "Vision x Audio: ",
+    torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.AUDIO].T, dim=-1),
+)
+
+# Expected output:
+#
+# Vision x Text:
+# tensor([[9.9761e-01, 2.3694e-03, 1.8612e-05],
+#         [3.3836e-05, 9.9994e-01, 2.4118e-05],
+#         [4.7997e-05, 1.3496e-02, 9.8646e-01]])
+#
+# Audio x Text:
+# tensor([[1., 0., 0.],
+#         [0., 1., 0.],
+#         [0., 0., 1.]])
+#
+# Vision x Audio:
+# tensor([[0.8070, 0.1088, 0.0842],
+#         [0.1036, 0.7884, 0.1079],
+#         [0.0018, 0.0022, 0.9960]])
diff --git a/bpe/bpe_simple_vocab_16e6.txt.gz b/bpe/bpe_simple_vocab_16e6.txt.gz
diff --git a/build/lib/imagebind/__init__.py b/build/lib/imagebind/__init__.py
@@ -0,0 +1,3 @@
+from imagebind import data
+from imagebind.models import imagebind_model
+from imagebind.models.imagebind_model import ModalityType