huggingface · merveenoyan · Feb 28, 2025 · Feb 26, 2025 · Feb 26, 2025 · Feb 26, 2025
@@ -41,7 +41,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A robust depth estimation model.",
-			id: "apple/DepthPro",
+			id: "apple/DepthPro-hf",
 		},
 	],
 	spaces: [

@@ -48,7 +48,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A screenshot understanding model used to control computers.",
-			id: "showlab/ShowUI-2B",
+			id: "microsoft/OmniParser-v2.0",
 		},
 		{
 			description: "Cutting-edge vision language model.",
@@ -63,12 +63,16 @@ const taskData: TaskDataCustom = {
 			id: "Qwen/Qwen2.5-VL-7B-Instruct",
 		},
 		{
-			description: "Image-text-to-text model with reasoning capabilities.",
-			id: "Qwen/QVQ-72B-Preview",
+			description: "Image-text-to-text model with agentic capabilities.",
+			id: "microsoft/Magma-8B",
 		},
 		{
 			description: "Strong image-text-to-text model focused on documents.",
-			id: "stepfun-ai/GOT-OCR2_0",
+			id: "allenai/olmOCR-7B-0225-preview",
+		},
+		{
+			description: "Small yet strong image-text-to-text model.",
+			id: "ibm-granite/granite-vision-3.2-2b",
 		},
 	],
 	spaces: [
@@ -85,8 +89,8 @@ const taskData: TaskDataCustom = {
 			id: "akhaliq/Molmo-7B-D-0924",
 		},
 		{
-			description: "An image-text-to-text application focused on documents.",
-			id: "stepfun-ai/GOT_official_online_demo",
+			description: "Powerful vision language assistant that can understand multiple images.",
+			id: "HuggingFaceTB/SmolVLM2",
 		},
 		{
 			description: "An application for chatting with an image-text-to-text model.",

@@ -27,6 +27,10 @@ const taskData: TaskDataCustom = {
 			description: "A robust keypoint detection model.",
 			id: "magic-leap-community/superpoint",
 		},
+		{
+			description: "A robust keypoint matching model.",
+			id: "magic-leap-community/superglue_outdoor",
+		},
 		{
 			description: "Strong keypoint detection model used to detect human pose.",
 			id: "facebook/sapiens-pose-1b",

@@ -47,12 +47,12 @@ const taskData: TaskDataCustom = {
 			id: "facebook/detr-resnet-50",
 		},
 		{
-			description: "Real-time and accurate object detection model.",
-			id: "jameslahm/yolov10x",
+			description: "Accurate object detection model.",
+			id: "IDEA-Research/dab-detr-resnet-50",
 		},
 		{
-			description: "Fast and accurate object detection model trained on COCO and Object365 datasets.",
-			id: "PekingU/rtdetr_r18vd_coco_o365",
+			description: "Fast and accurate object detection model.",
+			id: "PekingU/rtdetr_v2_r50vd",
 		},
 		{
 			description: "Object detection model for low-lying objects.",
@@ -70,7 +70,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A cutting-edge object detection application.",
-			id: "Ultralytics/YOLO11",
+			id: "sunsmarterjieleaf/yolov12",
 		},
 		{
 			description: "An object tracking, segmentation and inpainting application.",

@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A very powerful model with reasoning capabilities.",
-			id: "PowerInfer/SmallThinker-3B-Preview",
+			id: "simplescaling/s1.1-32B",
 		},
 		{
 			description: "Strong conversational model that supports very long instructions.",

@@ -76,6 +76,10 @@ const taskData: TaskDataCustom = {
 			description: "An application that synthesizes emotional speech for diverse speaker prompts.",
 			id: "parler-tts/parler-tts-expresso",
 		},
+		{
+			description: "An application that generates podcast episodes.",
+			id: "ngxson/kokoro-podcast-generator",
+		},
 	],
 	summary:
 		"Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",

@@ -78,6 +78,10 @@ const taskData: TaskDataCustom = {
 			description: "A text-to-video model focusing on physics-aware applications like robotics.",
 			id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",
 		},
+		{
+			description: "A robust model for video generation.",
+			id: "Wan-AI/Wan2.1-T2V-1.3B",
+		},
 	],
 	spaces: [
 		{
@@ -86,7 +90,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "Consistent video generation application.",
-			id: "TIGER-Lab/T2V-Turbo-V2",
+			id: "Wan-AI/Wan2.1",
 		},
 		{
 			description: "A cutting edge video generation application.",

@@ -46,6 +46,10 @@ const taskData: TaskDataCustom = {
 			description: "Strong video-text-to-text model with reasoning capabilities.",
 			id: "GoodiesHere/Apollo-LMMs-Apollo-7B-t32",
 		},
+		{
+			description: "Strong video-text-to-text model.",
+			id: "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+		},
 	],
 	spaces: [
 		{
@@ -56,6 +60,10 @@ const taskData: TaskDataCustom = {
 			description: "A leaderboard for various video-text-to-text models.",
 			id: "opencompass/openvlm_video_leaderboard",
 		},
+		{
+			description: "An application to generate highlights from a video.",
+			id: "HuggingFaceTB/SmolVLM2-HighlightGenerator",
+		},
 	],
 	summary:
 		"Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",

@@ -60,6 +60,10 @@ const taskData: TaskDataCustom = {
 			description: "Cutting-edge zero-shot multilingual text classification model.",
 			id: "MoritzLaurer/ModernBERT-large-zeroshot-v2.0",
 		},
+		{
+			description: "Zero-shot text classification model that can be used for topic and sentiment classification.",
+			id: "knowledgator/gliclass-modern-base-v2.0-init",
+		},
 	],
 	spaces: [],
 	summary:

@@ -53,11 +53,11 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "Strong zero-shot image classification model.",
-			id: "google/siglip-so400m-patch14-224",
+			id: "google/siglip2-base-patch16-224",
 		},
 		{
 			description: "Robust zero-shot image classification model.",
-			id: "microsoft/LLM2CLIP-EVA02-L-14-336",
+			id: "intfloat/mmE5-mllama-11b-instruct",
 		},
 		{
 			description: "Powerful zero-shot image classification model supporting 94 languages.",