diff --git a/deep-learning/.DS_Store b/deep-learning/.DS_Store index 398d5eb4..4b16ec1b 100644 Binary files a/deep-learning/.DS_Store and b/deep-learning/.DS_Store differ diff --git a/deep-learning/Transformer-Tutorials/ViTMAE/ViT_MAE_visualization_demo.ipynb b/deep-learning/Transformer-Tutorials/ViTMAE/ViT_MAE_visualization_demo.ipynb new file mode 100644 index 00000000..0453c9f6 --- /dev/null +++ b/deep-learning/Transformer-Tutorials/ViTMAE/ViT_MAE_visualization_demo.ipynb @@ -0,0 +1,1409 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "ViT MAE visualization demo.ipynb", + "provenance": [], + "collapsed_sections": [], + "authorship_tag": "ABX9TyMgl6Ed1GSnla07Sp8p28aU", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "e4a3230655b4475e90f2f7fde0a9fbac": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HBoxView", + "_dom_classes": [], + "_model_name": "HBoxModel", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.5.0", + "box_style": "", + "layout": "IPY_MODEL_11f25d252aa64740abf09b2e69d027b3", + "_model_module": "@jupyter-widgets/controls", + "children": [ + "IPY_MODEL_c7f565f21c6640d6ab351000d7fd584e", + "IPY_MODEL_3f0a4cd7fb4e4d2e993edb059a4462d9", + "IPY_MODEL_b4db0afcbb9948319c96b33e1815fdad" + ] + } + }, + "11f25d252aa64740abf09b2e69d027b3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "c7f565f21c6640d6ab351000d7fd584e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_438b4c85c20b4eccbf0a3e6bb53d13aa", + "_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": "Downloading: 100%", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_5c7ca908b4d54c4797bf17ea404bd0fd" + } + }, + "3f0a4cd7fb4e4d2e993edb059a4462d9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "ProgressView", + "style": "IPY_MODEL_db89871792494f728d6025df1bc27a3f", + "_dom_classes": [], + "description": "", + "_model_name": "FloatProgressModel", + "bar_style": "success", + "max": 271, + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": 271, + "_view_count": null, + "_view_module_version": "1.5.0", + "orientation": "horizontal", + "min": 0, + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_8afeaf84a01047d988de6f6981135807" + } + }, + "b4db0afcbb9948319c96b33e1815fdad": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_7d707ec996e345c19b4766be71584a27", + "_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": " 271/271 [00:00<00:00, 7.03kB/s]", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_72268e98134d44fd998ed73967eac1be" + } + }, + "438b4c85c20b4eccbf0a3e6bb53d13aa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "5c7ca908b4d54c4797bf17ea404bd0fd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "db89871792494f728d6025df1bc27a3f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "ProgressStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "bar_color": null, + "_model_module": "@jupyter-widgets/controls" + } + }, + "8afeaf84a01047d988de6f6981135807": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "7d707ec996e345c19b4766be71584a27": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "72268e98134d44fd998ed73967eac1be": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "f92cc207024740e385e53c1b6b9d229a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HBoxView", + "_dom_classes": [], + "_model_name": "HBoxModel", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.5.0", + "box_style": "", + "layout": "IPY_MODEL_1dd8f297a5a24dd5930e9bac173b87bc", + "_model_module": "@jupyter-widgets/controls", + "children": [ + "IPY_MODEL_fa771f9b26254260be58c9859d5d5c48", + "IPY_MODEL_3c7bbda6a76a479a8c65a5f33e0b0bb7", + "IPY_MODEL_b6c02fc26c63419689d377c61e7306ca" + ] + } + }, + "1dd8f297a5a24dd5930e9bac173b87bc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "fa771f9b26254260be58c9859d5d5c48": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_f2c2bbf24ba145ef9a37c3a744b86fe3", + "_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": "Downloading: 100%", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_85a4ee62f660428d84732e81dc3a5ab6" + } + }, + "3c7bbda6a76a479a8c65a5f33e0b0bb7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "ProgressView", + "style": "IPY_MODEL_e3821fbaa11b4950a0735079fff41757", + "_dom_classes": [], + "description": "", + "_model_name": "FloatProgressModel", + "bar_style": "success", + "max": 676, + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": 676, + "_view_count": null, + "_view_module_version": "1.5.0", + "orientation": "horizontal", + "min": 0, + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_b9753d4c7a6740879933e34507c0a885" + } + }, + "b6c02fc26c63419689d377c61e7306ca": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_0a6c8b2338df42679b1d8e025bf99348", + "_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": " 676/676 [00:00<00:00, 13.8kB/s]", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_799210215b99475fb871a2f949e82935" + } + }, + "f2c2bbf24ba145ef9a37c3a744b86fe3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "85a4ee62f660428d84732e81dc3a5ab6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "e3821fbaa11b4950a0735079fff41757": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "ProgressStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "bar_color": null, + "_model_module": "@jupyter-widgets/controls" + } + }, + "b9753d4c7a6740879933e34507c0a885": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "0a6c8b2338df42679b1d8e025bf99348": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "799210215b99475fb871a2f949e82935": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "6edb3b829a664b1ea330abe61ba0d0c7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HBoxView", + "_dom_classes": [], + "_model_name": "HBoxModel", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.5.0", + "box_style": "", + "layout": "IPY_MODEL_bd920f2bc3734ba9b8d654b656daa32a", + "_model_module": "@jupyter-widgets/controls", + "children": [ + "IPY_MODEL_06cd6f5176474c31a39d993b11650717", + "IPY_MODEL_770d2d8a4b1a44d694d541eb1fc33a83", + "IPY_MODEL_6760933f3e544998ae7c148ee64e3f1b" + ] + } + }, + "bd920f2bc3734ba9b8d654b656daa32a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "06cd6f5176474c31a39d993b11650717": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_efb12057e48e4ea5aa4f8df8b864a813", + "_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": "Downloading: 100%", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_fa28a6e0fc3640c6b212b0563f265097" + } + }, + "770d2d8a4b1a44d694d541eb1fc33a83": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "ProgressView", + "style": "IPY_MODEL_90bc224b4db84fc0ae569dd36f39c804", + "_dom_classes": [], + "description": "", + "_model_name": "FloatProgressModel", + "bar_style": "success", + "max": 447760421, + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": 447760421, + "_view_count": null, + "_view_module_version": "1.5.0", + "orientation": "horizontal", + "min": 0, + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_fea6cedc00314d559be0a86db5aedf94" + } + }, + "6760933f3e544998ae7c148ee64e3f1b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_ea056ee72e6d440496b3eb37a6ceed48", + "_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": " 427M/427M [00:13<00:00, 40.3MB/s]", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_c741fa8d57e141feab1a3f089bc34fc2" + } + }, + "efb12057e48e4ea5aa4f8df8b864a813": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "fa28a6e0fc3640c6b212b0563f265097": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "90bc224b4db84fc0ae569dd36f39c804": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "ProgressStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "bar_color": null, + "_model_module": "@jupyter-widgets/controls" + } + }, + "fea6cedc00314d559be0a86db5aedf94": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "ea056ee72e6d440496b3eb37a6ceed48": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "c741fa8d57e141feab1a3f089bc34fc2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Visualization demo: Masked Autoencoders (MAE)\n", + "\n", + "In this notebook, we are going to visualize some predictions of a Vision Transformer (ViT) pre-trained on a very simple objective, namely masked patch prediction. The model needs to reconstruct pixel values for masked patches (as shown in the figure below).\n", + "\n", + "\n", + "\n", + "* Paper: https://arxiv.org/abs/2111.06377\n", + "* Original repo (on which this notebook is based): https://github.com/facebookresearch/mae\n", + "* Video explaining the paper: https://www.youtube.com/watch?v=Dp6iICL2dVI \n", + "\n", + "## Set-up environment" + ], + "metadata": { + "id": "CTic-E6Gdcw5" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7nhbrpiTdUJY", + "outputId": "84aa2996-b166-4284-9724-52c2be5e70f5" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[K |████████████████████████████████| 67 kB 2.6 MB/s \n", + "\u001b[K |████████████████████████████████| 596 kB 11.0 MB/s \n", + "\u001b[K |████████████████████████████████| 6.8 MB 52.2 MB/s \n", + "\u001b[K |████████████████████████████████| 895 kB 73.6 MB/s \n", + "\u001b[?25h Building wheel for transformers (PEP 517) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "!pip install -q git+https://github.com/huggingface/transformers.git" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Prepare image\n", + "\n", + "Here we apply some very basic image processing, namely resizing to 224x224 and normalizing the channels." + ], + "metadata": { + "id": "4Jmt6oBNdpJq" + } + }, + { + "cell_type": "code", + "source": [ + "from transformers import ViTFeatureExtractor\n", + "import requests\n", + "from PIL import Image\n", + "\n", + "feature_extractor = ViTFeatureExtractor.from_pretrained(\"facebook/vit-mae-base\")\n", + "url = \"https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + "image" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 273, + "referenced_widgets": [ + "e4a3230655b4475e90f2f7fde0a9fbac", + "11f25d252aa64740abf09b2e69d027b3", + "c7f565f21c6640d6ab351000d7fd584e", + "3f0a4cd7fb4e4d2e993edb059a4462d9", + "b4db0afcbb9948319c96b33e1815fdad", + "438b4c85c20b4eccbf0a3e6bb53d13aa", + "5c7ca908b4d54c4797bf17ea404bd0fd", + "db89871792494f728d6025df1bc27a3f", + "8afeaf84a01047d988de6f6981135807", + "7d707ec996e345c19b4766be71584a27", + "72268e98134d44fd998ed73967eac1be" + ] + }, + "id": "VqjStfjcdenC", + "outputId": "47b6f4b7-7279-4489-a12d-a9778dcd55b2" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e4a3230655b4475e90f2f7fde0a9fbac", + "version_minor": 0, + "version_major": 2 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/271 [00:00" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ] + }, + { + "cell_type": "code", + "source": [ + "pixel_values = feature_extractor(image, return_tensors=\"pt\").pixel_values" + ], + "metadata": { + "id": "QaIpYzC_d3Xv" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Visualization\n", + "\n", + "Next, we forward the pixel values through the model. The encoder (which is a standard Vision Transformer) will first encode the visual patches. Next, a learnable mask token is added at the positions of the masked patches, and the decoder (which is also a Transformer) reconstructs the pixel values based on the encoded visual patches + mask tokens.\n", + "\n", + "The authors saw the best performance when masking out a large portion (75%) of the image patches." + ], + "metadata": { + "id": "qTKBqkbHd5-Y" + } + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "imagenet_mean = np.array(feature_extractor.image_mean)\n", + "imagenet_std = np.array(feature_extractor.image_std)\n", + "\n", + "def show_image(image, title=''):\n", + " # image is [H, W, 3]\n", + " assert image.shape[2] == 3\n", + " plt.imshow(torch.clip((image * imagenet_std + imagenet_mean) * 255, 0, 255).int())\n", + " plt.title(title, fontsize=16)\n", + " plt.axis('off')\n", + " return\n", + "\n", + "def visualize(pixel_values, model):\n", + " # forward pass\n", + " outputs = model(pixel_values)\n", + " y = model.unpatchify(outputs.logits)\n", + " y = torch.einsum('nchw->nhwc', y).detach().cpu()\n", + " \n", + " # visualize the mask\n", + " mask = outputs.mask.detach()\n", + " mask = mask.unsqueeze(-1).repeat(1, 1, model.config.patch_size**2 *3) # (N, H*W, p*p*3)\n", + " mask = model.unpatchify(mask) # 1 is removing, 0 is keeping\n", + " mask = torch.einsum('nchw->nhwc', mask).detach().cpu()\n", + " \n", + " x = torch.einsum('nchw->nhwc', pixel_values)\n", + "\n", + " # masked image\n", + " im_masked = x * (1 - mask)\n", + "\n", + " # MAE reconstruction pasted with visible patches\n", + " im_paste = x * (1 - mask) + y * mask\n", + "\n", + " # make the plt figure larger\n", + " plt.rcParams['figure.figsize'] = [24, 24]\n", + "\n", + " plt.subplot(1, 4, 1)\n", + " show_image(x[0], \"original\")\n", + "\n", + " plt.subplot(1, 4, 2)\n", + " show_image(im_masked[0], \"masked\")\n", + "\n", + " plt.subplot(1, 4, 3)\n", + " show_image(y[0], \"reconstruction\")\n", + "\n", + " plt.subplot(1, 4, 4)\n", + " show_image(im_paste[0], \"reconstruction + visible\")\n", + "\n", + " plt.show()" + ], + "metadata": { + "id": "CG0G2btrd6fl" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from transformers import ViTMAEForPreTraining\n", + "\n", + "# make random mask reproducible (comment out to make it change)\n", + "torch.manual_seed(2)\n", + "\n", + "model = ViTMAEForPreTraining.from_pretrained(\"facebook/vit-mae-base\")\n", + "\n", + "visualize(pixel_values, model)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 324, + "referenced_widgets": [ + "f92cc207024740e385e53c1b6b9d229a", + "1dd8f297a5a24dd5930e9bac173b87bc", + "fa771f9b26254260be58c9859d5d5c48", + "3c7bbda6a76a479a8c65a5f33e0b0bb7", + "b6c02fc26c63419689d377c61e7306ca", + "f2c2bbf24ba145ef9a37c3a744b86fe3", + "85a4ee62f660428d84732e81dc3a5ab6", + "e3821fbaa11b4950a0735079fff41757", + "b9753d4c7a6740879933e34507c0a885", + "0a6c8b2338df42679b1d8e025bf99348", + "799210215b99475fb871a2f949e82935", + "6edb3b829a664b1ea330abe61ba0d0c7", + "bd920f2bc3734ba9b8d654b656daa32a", + "06cd6f5176474c31a39d993b11650717", + "770d2d8a4b1a44d694d541eb1fc33a83", + "6760933f3e544998ae7c148ee64e3f1b", + "efb12057e48e4ea5aa4f8df8b864a813", + "fa28a6e0fc3640c6b212b0563f265097", + "90bc224b4db84fc0ae569dd36f39c804", + "fea6cedc00314d559be0a86db5aedf94", + "ea056ee72e6d440496b3eb37a6ceed48", + "c741fa8d57e141feab1a3f089bc34fc2" + ] + }, + "id": "hN7hnFU4epXB", + "outputId": "ac4147cf-86d5-4b43-c9db-8f912be5edc0" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f92cc207024740e385e53c1b6b9d229a", + "version_minor": 0, + "version_major": 2 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/676 [00:00" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## After pre-training\n", + "\n", + "Note that, after pre-training, you can directly load the weights of the encoder into a `ViTForImageClassification` and start fine-tuning/doing linear probing. Note that a warning will be printed, indicating that the weights of the decoder aren't used." + ], + "metadata": { + "id": "vSY6xg2hpkZA" + } + }, + { + "cell_type": "code", + "source": [ + "from transformers import ViTForImageClassification\n", + "\n", + "model = ViTForImageClassification.from_pretrained(\"facebook/vit-mae-base\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "A_LuqXMCiQvS", + "outputId": "498eb515-a55f-49f8-e752-5e61d3fdb036" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "You are using a model of type vit_mae to instantiate a model of type vit. This is not supported for all configurations of models and can yield errors.\n", + "Some weights of the model checkpoint at facebook/vit-mae-base were not used when initializing ViTForImageClassification: ['decoder.decoder_layers.0.attention.attention.value.weight', 'decoder.decoder_layers.3.output.dense.bias', 'decoder.decoder_layers.5.intermediate.dense.bias', 'decoder.decoder_layers.5.attention.attention.key.weight', 'decoder.decoder_layers.3.attention.attention.key.weight', 'decoder.decoder_layers.4.intermediate.dense.weight', 'decoder.decoder_layers.0.layernorm_before.bias', 'decoder.decoder_layers.1.layernorm_before.bias', 'decoder.decoder_layers.1.attention.attention.key.bias', 'decoder.decoder_layers.2.attention.attention.key.bias', 'decoder.decoder_layers.5.attention.attention.query.bias', 'decoder.decoder_layers.7.layernorm_after.weight', 'decoder.decoder_layers.5.attention.attention.value.weight', 'decoder.decoder_layers.6.attention.attention.query.weight', 'decoder.decoder_layers.3.attention.attention.value.bias', 'decoder.decoder_layers.2.layernorm_before.weight', 'decoder.decoder_pred.bias', 'decoder.decoder_layers.2.attention.output.dense.bias', 'decoder.decoder_layers.1.output.dense.bias', 'decoder.decoder_layers.4.attention.attention.value.bias', 'decoder.decoder_layers.0.layernorm_after.weight', 'decoder.decoder_layers.6.layernorm_before.weight', 'decoder.decoder_layers.5.layernorm_before.weight', 'decoder.decoder_layers.7.attention.attention.value.bias', 'decoder.decoder_layers.7.intermediate.dense.weight', 'decoder.decoder_layers.3.intermediate.dense.weight', 'decoder.decoder_layers.5.layernorm_before.bias', 'decoder.decoder_layers.6.attention.attention.key.bias', 'decoder.decoder_layers.7.intermediate.dense.bias', 'decoder.decoder_layers.4.layernorm_after.weight', 'decoder.decoder_layers.5.output.dense.bias', 'decoder.decoder_layers.3.attention.attention.value.weight', 'decoder.decoder_layers.4.layernorm_after.bias', 'decoder.decoder_layers.1.attention.attention.query.bias', 'decoder.decoder_layers.1.attention.attention.value.weight', 'decoder.decoder_layers.0.attention.attention.query.bias', 'decoder.decoder_layers.0.layernorm_after.bias', 'decoder.decoder_layers.0.intermediate.dense.weight', 'decoder.decoder_norm.weight', 'decoder.decoder_layers.2.output.dense.weight', 'decoder.decoder_layers.3.intermediate.dense.bias', 'decoder.decoder_layers.6.attention.output.dense.weight', 'decoder.decoder_layers.0.attention.attention.query.weight', 'decoder.decoder_layers.4.output.dense.bias', 'decoder.decoder_layers.2.layernorm_after.bias', 'decoder.decoder_pred.weight', 'decoder.decoder_layers.4.layernorm_before.bias', 'decoder.decoder_layers.3.attention.output.dense.weight', 'decoder.decoder_layers.1.intermediate.dense.weight', 'decoder.decoder_layers.0.attention.attention.value.bias', 'decoder.decoder_layers.6.layernorm_after.weight', 'decoder.decoder_layers.0.attention.output.dense.weight', 'decoder.decoder_layers.7.output.dense.weight', 'decoder.decoder_layers.6.layernorm_after.bias', 'decoder.mask_token', 'decoder.decoder_layers.2.attention.attention.value.bias', 'decoder.decoder_layers.3.layernorm_after.weight', 'decoder.decoder_layers.7.attention.output.dense.weight', 'decoder.decoder_layers.2.intermediate.dense.bias', 'decoder.decoder_layers.0.output.dense.weight', 'decoder.decoder_layers.1.attention.output.dense.weight', 'decoder.decoder_layers.1.layernorm_after.bias', 'decoder.decoder_layers.2.output.dense.bias', 'decoder.decoder_layers.5.layernorm_after.weight', 'decoder.decoder_layers.6.attention.attention.value.bias', 'decoder.decoder_layers.6.attention.attention.key.weight', 'decoder.decoder_layers.6.output.dense.weight', 'decoder.decoder_layers.3.attention.attention.query.bias', 'decoder.decoder_layers.2.intermediate.dense.weight', 'decoder.decoder_layers.1.attention.output.dense.bias', 'decoder.decoder_layers.6.attention.output.dense.bias', 'decoder.decoder_layers.2.layernorm_after.weight', 'decoder.decoder_layers.2.attention.attention.query.weight', 'decoder.decoder_layers.2.layernorm_before.bias', 'decoder.decoder_layers.7.attention.attention.value.weight', 'decoder.decoder_layers.0.attention.output.dense.bias', 'decoder.decoder_layers.5.attention.attention.value.bias', 'decoder.decoder_layers.5.attention.attention.key.bias', 'decoder.decoder_layers.2.attention.output.dense.weight', 'decoder.decoder_layers.4.intermediate.dense.bias', 'decoder.decoder_layers.7.output.dense.bias', 'decoder.decoder_layers.4.layernorm_before.weight', 'decoder.decoder_layers.1.attention.attention.query.weight', 'decoder.decoder_pos_embed', 'decoder.decoder_norm.bias', 'decoder.decoder_layers.1.attention.attention.value.bias', 'decoder.decoder_layers.1.intermediate.dense.bias', 'decoder.decoder_layers.6.intermediate.dense.weight', 'decoder.decoder_layers.6.attention.attention.query.bias', 'decoder.decoder_layers.5.layernorm_after.bias', 'decoder.decoder_layers.1.output.dense.weight', 'decoder.decoder_layers.6.intermediate.dense.bias', 'decoder.decoder_layers.6.output.dense.bias', 'decoder.decoder_embed.bias', 'decoder.decoder_embed.weight', 'decoder.decoder_layers.7.attention.attention.key.bias', 'decoder.decoder_layers.7.layernorm_before.weight', 'decoder.decoder_layers.7.attention.attention.query.weight', 'decoder.decoder_layers.3.layernorm_before.weight', 'decoder.decoder_layers.7.layernorm_before.bias', 'decoder.decoder_layers.3.layernorm_before.bias', 'decoder.decoder_layers.1.layernorm_before.weight', 'decoder.decoder_layers.7.attention.output.dense.bias', 'decoder.decoder_layers.5.output.dense.weight', 'decoder.decoder_layers.0.intermediate.dense.bias', 'decoder.decoder_layers.7.attention.attention.query.bias', 'decoder.decoder_layers.4.attention.output.dense.bias', 'decoder.decoder_layers.7.layernorm_after.bias', 'decoder.decoder_layers.5.attention.output.dense.bias', 'decoder.decoder_layers.0.layernorm_before.weight', 'decoder.decoder_layers.4.attention.output.dense.weight', 'decoder.decoder_layers.4.attention.attention.query.bias', 'decoder.decoder_layers.1.attention.attention.key.weight', 'decoder.decoder_layers.5.attention.attention.query.weight', 'decoder.decoder_layers.0.attention.attention.key.weight', 'decoder.decoder_layers.4.attention.attention.key.weight', 'decoder.decoder_layers.2.attention.attention.key.weight', 'decoder.decoder_layers.6.attention.attention.value.weight', 'decoder.decoder_layers.3.attention.attention.query.weight', 'decoder.decoder_layers.7.attention.attention.key.weight', 'decoder.decoder_layers.4.output.dense.weight', 'decoder.decoder_layers.2.attention.attention.query.bias', 'decoder.decoder_layers.3.layernorm_after.bias', 'decoder.decoder_layers.5.intermediate.dense.weight', 'decoder.decoder_layers.4.attention.attention.query.weight', 'decoder.decoder_layers.5.attention.output.dense.weight', 'decoder.decoder_layers.6.layernorm_before.bias', 'decoder.decoder_layers.0.output.dense.bias', 'decoder.decoder_layers.4.attention.attention.value.weight', 'decoder.decoder_layers.1.layernorm_after.weight', 'decoder.decoder_layers.2.attention.attention.value.weight', 'decoder.decoder_layers.3.attention.attention.key.bias', 'decoder.decoder_layers.4.attention.attention.key.bias', 'decoder.decoder_layers.3.attention.output.dense.bias', 'decoder.decoder_layers.3.output.dense.weight', 'decoder.decoder_layers.0.attention.attention.key.bias']\n", + "- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of ViTForImageClassification were not initialized from the model checkpoint at facebook/vit-mae-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "xhwzUBsdmE-y" + }, + "execution_count": 6, + "outputs": [] + } + ] +} \ No newline at end of file