From 447147a25b5c1e5cf910e5fad3f5ed38de3344ee Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Mon, 23 Dec 2024 16:52:08 -0800
Subject: [PATCH] Switching to timm specific weight instances for open_clip image encoders to facilitate hf-hub: use in timm and new transformers TimmWrapper

---
 timm/models/convnext.py           | 30 ++++++-----------
 timm/models/vision_transformer.py | 56 ++++++++++++-------------
 2 files changed, 32 insertions(+), 54 deletions(-)

diff --git a/timm/models/convnext.py b/timm/models/convnext.py
index e682379f64..a6d1999bde 100644
--- a/timm/models/convnext.py
+++ b/timm/models/convnext.py
@@ -916,53 +916,43 @@ def _cfgv2(url='', **kwargs):
 
     # CLIP original image tower weights
     'convnext_base.clip_laion2b': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w-laion2B-s13B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
     'convnext_base.clip_laion2b_augreg': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w-laion2B-s13B-b82K-augreg',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
     'convnext_base.clip_laiona': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w-laion_aesthetic-s13B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
     'convnext_base.clip_laiona_320': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=640),
     'convnext_base.clip_laiona_augreg_320': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K-augreg',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=640),
     'convnext_large_mlp.clip_laion2b_augreg': _cfg(
-        hf_hub_id='laion/CLIP-convnext_large_d.laion2B-s26B-b102K-augreg',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=768),
     'convnext_large_mlp.clip_laion2b_ft_320': _cfg(
-        hf_hub_id='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=768),
     'convnext_large_mlp.clip_laion2b_ft_soup_320': _cfg(
-        hf_hub_id='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=768),
     'convnext_xxlarge.clip_laion2b_soup': _cfg(
-        hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
     'convnext_xxlarge.clip_laion2b_rewind': _cfg(
-        hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-rewind',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
 
diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py
index b3b0ddca07..63526c93b7 100644
--- a/timm/models/vision_transformer.py
+++ b/timm/models/vision_transformer.py
@@ -1556,9 +1556,6 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0),
-    'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k',  # FIXME weight exists, need to push
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
     'vit_base_patch16_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
@@ -1569,9 +1566,6 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0, num_classes=11821),
-    'vit_base_patch32_clip_224.openai_ft_in12k': _cfg(
-        # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k',  # FIXME weight exists, need to push
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
     'vit_base_patch16_clip_224.openai_ft_in12k': _cfg(
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
@@ -1580,28 +1574,22 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
     'vit_base_patch32_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
     'vit_base_patch16_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-16-laion2B-s34B-b88K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=768),
     'vit_huge_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
     'vit_giant_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
     'vit_gigantic_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
     'vit_base_patch32_clip_224.laion400m_e32': _cfg(
@@ -1620,21 +1608,17 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_base_patch32_clip_224.datacompxl': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch32_clip_256.datacompxl': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0,
         input_size=(3, 256, 256), num_classes=512),
     'vit_base_patch16_clip_224.datacompxl': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.datacompxl': _cfg(
-        hf_hub_id='laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_base_patch16_clip_224.dfn2b': _cfg(
@@ -1659,42 +1643,46 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024),
     'vit_base_patch32_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch16_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_huge_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+    'vit_huge_patch14_clip_224.metaclip_altogether': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
     'vit_gigantic_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
     'vit_base_patch32_clip_224.metaclip_400m': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch16_clip_224.metaclip_400m': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.metaclip_400m': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
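
For reference, a minimal sketch (not part of the patch) of how these relocated open_clip image towers are intended to be consumed once the weights live under the timm organization. It assumes the hub repo resolves to 'timm/' + model name (e.g. timm/vit_base_patch16_clip_224.laion2b), and the transformers call assumes a release that ships the new TimmWrapper integration and a repo with a transformers-compatible config.json:

    import timm
    import torch

    # With hf_hub_id='timm/' the pretrained tag resolves to 'timm/' + model name on the HF Hub.
    model = timm.create_model('vit_base_patch16_clip_224.laion2b', pretrained=True).eval()

    # The same checkpoint can be addressed explicitly with timm's hf-hub: prefix
    # (assumed repo name: timm/vit_base_patch16_clip_224.laion2b).
    model = timm.create_model('hf-hub:timm/vit_base_patch16_clip_224.laion2b', pretrained=True).eval()

    # Build preprocessing that matches the pretrained_cfg (mean/std/crop_pct above).
    cfg = timm.data.resolve_data_config({}, model=model)
    transform = timm.data.create_transform(**cfg)

    with torch.no_grad():
        embed = model(torch.randn(1, 3, 224, 224))  # (1, 512) CLIP image embedding

    # A recent transformers release can also load the standalone timm checkpoint
    # through the new TimmWrapper model classes via AutoModel.
    from transformers import AutoModel
    wrapped = AutoModel.from_pretrained('timm/vit_base_patch16_clip_224.laion2b')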