references.bib

@article{Fukushima1980,
  author    = {Fukushima, Kunihiko},
  title     = {Neocognitron: A self-organizing neural network model for a mechanism of pattern recognition unaffected by shift in position},
  journal   = {Biological Cybernetics},
  publisher = {Springer Science and Business Media {LLC}},
  volume    = {36},
  number    = {4},
  pages     = {193--202},
  year      = {1980},
  month     = apr,
  doi       = {10.1007/bf00344251},
  url       = {https://doi.org/10.1007/bf00344251}
}

@article{726791,
  author  = {LeCun, Y. and Bottou, L. and Bengio, Y. and Haffner, P.},
  title   = {Gradient-based learning applied to document recognition},
  journal = {Proceedings of the {IEEE}},
  volume  = {86},
  number  = {11},
  pages   = {2278--2324},
  year    = {1998},
  doi     = {10.1109/5.726791}
}

@article{6795724,
  author  = {LeCun, Y. and Boser, B. and Denker, J. S. and Henderson, D. and Howard, R. E. and Hubbard, W. and Jackel, L. D.},
  title   = {Backpropagation Applied to Handwritten Zip Code Recognition},
  journal = {Neural Computation},
  volume  = {1},
  number  = {4},
  pages   = {541--551},
  year    = {1989},
  doi     = {10.1162/neco.1989.1.4.541}
}

@book{Goodfellow-et-al-2016,
  author    = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
  title     = {Deep Learning},
  publisher = {MIT Press},
  year      = {2016},
  note      = {\url{http://www.deeplearningbook.org}}
}

@book{Arndt2011,
  author    = {Arndt, J{\"o}rg},
  title     = {Matters Computational},
  publisher = {Springer Berlin Heidelberg},
  year      = {2011},
  doi       = {10.1007/978-3-642-14764-7},
  url       = {https://doi.org/10.1007/978-3-642-14764-7}
}

@inproceedings{Zhou1988ComputationOO,
  author    = {Zhou, Yi-Tong and Chellappa, Rama},
  title     = {Computation of optical flow using a neural network},
  booktitle = {{IEEE} 1988 International Conference on Neural Networks},
  volume    = {2},
  pages     = {71--78},
  year      = {1988},
  url       = {https://api.semanticscholar.org/CorpusID:7292956}
}

@misc{lin2014network,
  author        = {Lin, Min and Chen, Qiang and Yan, Shuicheng},
  title         = {Network In Network},
  year          = {2014},
  eprint        = {1312.4400},
  archivePrefix = {arXiv},
  primaryClass  = {cs.NE}
}

@article{Zou2023,
  author    = {Zou, Zhengxia and Chen, Keyan and Shi, Zhenwei and Guo, Yuhong and Ye, Jieping},
  title     = {Object Detection in 20 Years: A Survey},
  journal   = {Proceedings of the {IEEE}},
  publisher = {Institute of Electrical and Electronics Engineers ({IEEE})},
  volume    = {111},
  number    = {3},
  pages     = {257--276},
  year      = {2023},
  month     = mar,
  doi       = {10.1109/jproc.2023.3238524},
  url       = {https://doi.org/10.1109/jproc.2023.3238524}
}

@article{Zaidi2022,
  author    = {Zaidi, Syed Sahil Abbas and Ansari, Mohammad Samar and Aslam, Asra and Kanwal, Nadia and Asghar, Mamoona and Lee, Brian},
  title     = {A survey of modern deep learning based object detection models},
  journal   = {Digital Signal Processing},
  publisher = {Elsevier {BV}},
  volume    = {126},
  pages     = {103514},
  year      = {2022},
  month     = jun,
  doi       = {10.1016/j.dsp.2022.103514},
  url       = {https://doi.org/10.1016/j.dsp.2022.103514}
}

@article{Viola2004,
  author    = {Viola, Paul and Jones, Michael J.},
  title     = {Robust Real-Time Face Detection},
  journal   = {International Journal of Computer Vision},
  publisher = {Springer Science and Business Media {LLC}},
  volume    = {57},
  number    = {2},
  pages     = {137--154},
  year      = {2004},
  month     = may,
  doi       = {10.1023/b:visi.0000013087.49260.fb},
  url       = {https://doi.org/10.1023/b:visi.0000013087.49260.fb}
}

@inproceedings{990517,
  author    = {Viola, P. and Jones, M.},
  title     = {Rapid object detection using a boosted cascade of simple features},
  booktitle = {Proceedings of the 2001 {IEEE} Computer Society Conference on Computer Vision and Pattern Recognition. {CVPR} 2001},
  volume    = {1},
  pages     = {I-511--I-518},
  year      = {2001},
  doi       = {10.1109/CVPR.2001.990517}
}

@inproceedings{1467360,
  author    = {Dalal, N. and Triggs, B.},
  title     = {Histograms of oriented gradients for human detection},
  booktitle = {2005 {IEEE} Computer Society Conference on Computer Vision and Pattern Recognition ({CVPR}'05)},
  volume    = {1},
  pages     = {886--893},
  year      = {2005},
  doi       = {10.1109/CVPR.2005.177}
}

@article{GU2022104401,
  author   = {Gu, Wenchao and Bai, Shuang and Kong, Lingxing},
  title    = {A review on {2D} instance segmentation based on deep neural networks},
  journal  = {Image and Vision Computing},
  volume   = {120},
  pages    = {104401},
  year     = {2022},
  issn     = {0262-8856},
  doi      = {10.1016/j.imavis.2022.104401},
  url      = {https://www.sciencedirect.com/science/article/pii/S0262885622000300},
  keywords = {Instance segmentation, Deep neural networks, Computer vision, Review}
}

@article{9356353,
  author  = {Minaee, Shervin and Boykov, Yuri and Porikli, Fatih and Plaza, Antonio and Kehtarnavaz, Nasser and Terzopoulos, Demetri},
  title   = {Image Segmentation Using Deep Learning: A Survey},
  journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
  volume  = {44},
  number  = {7},
  pages   = {3523--3542},
  year    = {2022},
  doi     = {10.1109/TPAMI.2021.3059968}
}

@inproceedings{10.1007/978-3-319-10584-0_20,
  author    = {Hariharan, Bharath and Arbel{\'a}ez, Pablo and Girshick, Ross and Malik, Jitendra},
  editor    = {Fleet, David and Pajdla, Tomas and Schiele, Bernt and Tuytelaars, Tinne},
  title     = {Simultaneous Detection and Segmentation},
  booktitle = {Computer Vision -- ECCV 2014},
  publisher = {Springer International Publishing},
  address   = {Cham},
  pages     = {297--312},
  year      = {2014}
}

@misc{wang2023internimage,
  author        = {Wang, Wenhai and Dai, Jifeng and Chen, Zhe and Huang, Zhenhang and Li, Zhiqi and Zhu, Xizhou and Hu, Xiaowei and Lu, Tong and Lu, Lewei and Li, Hongsheng and Wang, Xiaogang and Qiao, Yu},
  title         = {{InternImage}: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions},
  year          = {2023},
  eprint        = {2211.05778},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@misc{liu2022convnet,
  author        = {Liu, Zhuang and Mao, Hanzi and Wu, Chao-Yuan and Feichtenhofer, Christoph and Darrell, Trevor and Xie, Saining},
  title         = {A {ConvNet} for the 2020s},
  year          = {2022},
  eprint        = {2201.03545},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@misc{tan2021efficientnetv2,
  author        = {Tan, Mingxing and Le, Quoc V.},
  title         = {{EfficientNetV2}: Smaller Models and Faster Training},
  year          = {2021},
  eprint        = {2104.00298},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@misc{tan2020efficientdet,
  author        = {Tan, Mingxing and Pang, Ruoming and Le, Quoc V.},
  title         = {{EfficientDet}: Scalable and Efficient Object Detection},
  year          = {2020},
  eprint        = {1911.09070},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@article{Jaccard1912,
  author    = {Jaccard, Paul},
  title     = {The Distribution of the Flora in the {Alpine} Zone},
  journal   = {New Phytologist},
  publisher = {Wiley},
  volume    = {11},
  number    = {2},
  pages     = {37--50},
  year      = {1912},
  month     = feb,
  doi       = {10.1111/j.1469-8137.1912.tb05611.x},
  url       = {https://doi.org/10.1111/j.1469-8137.1912.tb05611.x}
}

@article{10.5169/SEALS-266450,
  author    = {Jaccard, Paul},
  title     = {{\'E}tude comparative de la distribution florale dans une portion des {Alpes} et du {Jura}},
  journal   = {Bulletin de la Soci{\'e}t{\'e} Vaudoise des Sciences Naturelles},
  volume    = {37},
  publisher = {Imprimerie Corbaz \& Comp.},
  year      = {1901},
  doi       = {10.5169/SEALS-266450},
  url       = {https://www.e-periodica.ch/digbib/view?pid=bsv-002:1901:37::790}
}

@misc{lin2015microsoft,
  author        = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Bourdev, Lubomir and Girshick, Ross and Hays, James and Perona, Pietro and Ramanan, Deva and Zitnick, C. Lawrence and Doll{\'a}r, Piotr},
  title         = {{Microsoft} {COCO}: Common Objects in Context},
  year          = {2015},
  eprint        = {1405.0312},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@article{DBLP:journals/corr/ZeilerF13,
  author     = {Zeiler, Matthew D. and Fergus, Rob},
  title      = {Visualizing and Understanding Convolutional Networks},
  journal    = {CoRR},
  volume     = {abs/1311.2901},
  year       = {2013},
  eprinttype = {arXiv},
  eprint     = {1311.2901},
  url        = {http://arxiv.org/abs/1311.2901},
  timestamp  = {Mon, 13 Aug 2018 16:48:37 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/ZeilerF13.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{Bengio2009,
  author    = {Bengio, Y.},
  title     = {Learning Deep Architectures for {AI}},
  journal   = {Foundations and Trends{\textregistered} in Machine Learning},
  publisher = {Now Publishers},
  volume    = {2},
  number    = {1},
  pages     = {1--127},
  year      = {2009},
  doi       = {10.1561/2200000006},
  url       = {https://doi.org/10.1561/2200000006}
}

@inproceedings{Jung_2021_ICCV,
  author    = {Jung, Hyungsik and Oh, Youngrock},
  title     = {Towards Better Explanations of Class Activation Mapping},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV})},
  pages     = {1336--1344},
  year      = {2021},
  month     = oct
}

@article{DBLP:journals/corr/SelvarajuDVCPB16,
  author     = {Selvaraju, Ramprasaath R. and Das, Abhishek and Vedantam, Ramakrishna and Cogswell, Michael and Parikh, Devi and Batra, Dhruv},
  title      = {{Grad-CAM}: Why did you say that? Visual Explanations from Deep Networks via Gradient-based Localization},
  journal    = {CoRR},
  volume     = {abs/1610.02391},
  year       = {2016},
  eprinttype = {arXiv},
  eprint     = {1610.02391},
  url        = {http://arxiv.org/abs/1610.02391},
  timestamp  = {Mon, 13 Aug 2018 16:46:58 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/SelvarajuDVCPB16.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}


@article{DBLP:journals/corr/abs-1710-11063,
  author     = {Chattopadhyay, Aditya and Sarkar, Anirban and Howlader, Prantik and Balasubramanian, Vineeth N.},
  title      = {{Grad-CAM++}: Generalized Gradient-based Visual Explanations for Deep Convolutional Networks},
  journal    = {CoRR},
  volume     = {abs/1710.11063},
  year       = {2017},
  eprinttype = {arXiv},
  eprint     = {1710.11063},
  url        = {http://arxiv.org/abs/1710.11063},
  timestamp  = {Tue, 02 Aug 2022 09:11:19 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1710-11063.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/ZhouKLOT15,
  author     = {Zhou, Bolei and Khosla, Aditya and Lapedriza, {\`{A}}gata and Oliva, Aude and Torralba, Antonio},
  title      = {Learning Deep Features for Discriminative Localization},
  journal    = {CoRR},
  volume     = {abs/1512.04150},
  year       = {2015},
  eprinttype = {arXiv},
  eprint     = {1512.04150},
  url        = {http://arxiv.org/abs/1512.04150},
  timestamp  = {Mon, 13 Aug 2018 16:47:46 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/ZhouKLOT15.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2008-02312,
  author     = {Fu, Ruigang and Hu, Qingyong and Dong, Xiaohu and Guo, Yulan and Gao, Yinghui and Li, Biao},
  title      = {Axiom-based {Grad-CAM}: Towards Accurate Visualization and Explanation of {CNNs}},
  journal    = {CoRR},
  volume     = {abs/2008.02312},
  year       = {2020},
  eprinttype = {arXiv},
  eprint     = {2008.02312},
  url        = {https://arxiv.org/abs/2008.02312},
  timestamp  = {Fri, 07 Aug 2020 15:07:21 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2008-02312.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-1910-01279,
  author     = {Wang, Haofan and Du, Mengnan and Yang, Fan and Zhang, Zijian},
  title      = {{Score-CAM}: Improved Visual Explanations Via Score-Weighted Class Activation Mapping},
  journal    = {CoRR},
  volume     = {abs/1910.01279},
  year       = {2019},
  eprinttype = {arXiv},
  eprint     = {1910.01279},
  url        = {http://arxiv.org/abs/1910.01279},
  timestamp  = {Thu, 04 Feb 2021 15:37:59 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1910-01279.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{9093360,
  author    = {Desai, Saurabh and Ramaswamy, Harish G.},
  title     = {{Ablation-CAM}: Visual Explanations for Deep Convolutional Network via Gradient-free Localization},
  booktitle = {2020 {IEEE} Winter Conference on Applications of Computer Vision ({WACV})},
  pages     = {972--980},
  year      = {2020},
  doi       = {10.1109/WACV45572.2020.9093360}
}

@article{DBLP:journals/corr/abs-1912-01451,
  author     = {Tomsett, Richard and Harborne, Dan and Chakraborty, Supriyo and Gurram, Prudhvi and Preece, Alun D.},
  title      = {Sanity Checks for Saliency Metrics},
  journal    = {CoRR},
  volume     = {abs/1912.01451},
  year       = {2019},
  eprinttype = {arXiv},
  eprint     = {1912.01451},
  url        = {http://arxiv.org/abs/1912.01451},
  timestamp  = {Thu, 02 Jan 2020 18:08:18 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1912-01451.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/SamekBMBM15,
  author     = {Samek, Wojciech and Binder, Alexander and Montavon, Gr{\'{e}}goire and Bach, Sebastian and M{\"{u}}ller, Klaus{-}Robert},
  title      = {Evaluating the visualization of what a Deep Neural Network has learned},
  journal    = {CoRR},
  volume     = {abs/1509.06321},
  year       = {2015},
  eprinttype = {arXiv},
  eprint     = {1509.06321},
  url        = {http://arxiv.org/abs/1509.06321},
  timestamp  = {Mon, 13 Aug 2018 16:46:08 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/SamekBMBM15.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2202-00449,
  author     = {Rong, Yao and Leemann, Tobias and Borisov, Vadim and Kasneci, Gjergji and Kasneci, Enkelejda},
  title      = {Evaluating Feature Attribution: An Information-Theoretic Perspective},
  journal    = {CoRR},
  volume     = {abs/2202.00449},
  year       = {2022},
  eprinttype = {arXiv},
  eprint     = {2202.00449},
  url        = {https://arxiv.org/abs/2202.00449},
  timestamp  = {Wed, 09 Feb 2022 15:43:35 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2202-00449.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{dumoulin2016guide,
  author        = {Dumoulin, Vincent and Visin, Francesco},
  title         = {A guide to convolution arithmetic for deep learning},
  journal       = {ArXiv e-prints},
  year          = {2016},
  month         = mar,
  eprint        = {1603.07285},
  archivePrefix = {arXiv}
}

@article{DBLP:journals/corr/HeGDG17,
  author     = {He, Kaiming and Gkioxari, Georgia and Doll{\'{a}}r, Piotr and Girshick, Ross B.},
  title      = {Mask {R-CNN}},
  journal    = {CoRR},
  volume     = {abs/1703.06870},
  year       = {2017},
  eprinttype = {arXiv},
  eprint     = {1703.06870},
  url        = {http://arxiv.org/abs/1703.06870},
  timestamp  = {Mon, 13 Aug 2018 16:46:36 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/HeGDG17.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}


@article{machines11070677,
  author         = {Hussain, Muhammad},
  title          = {{YOLO-v1} to {YOLO-v8}, the Rise of {YOLO} and Its Complementary Nature toward Digital Manufacturing and Industrial Defect Detection},
  journal        = {Machines},
  volume         = {11},
  number         = {7},
  pages          = {677},
  article-number = {677},
  year           = {2023},
  issn           = {2075-1702},
  doi            = {10.3390/machines11070677},
  url            = {https://www.mdpi.com/2075-1702/11/7/677},
  abstract       = {Since its inception in 2015, the YOLO (You Only Look Once) variant of object detectors has rapidly grown, with the latest release of YOLO-v8 in January 2023. YOLO variants are underpinned by the principle of real-time and high-classification performance, based on limited but efficient computational parameters. This principle has been found within the DNA of all YOLO variants with increasing intensity, as the variants evolve addressing the requirements of automated quality inspection within the industrial surface defect detection domain, such as the need for fast detection, high accuracy, and deployment onto constrained edge devices. This paper is the first to provide an in-depth review of the YOLO evolution from the original YOLO to the recent release (YOLO-v8) from the perspective of industrial manufacturing. The review explores the key architectural advancements proposed at each iteration, followed by examples of industrial deployment for surface defect detection endorsing its compatibility with industrial requirements.}
}

@article{DBLP:journals/corr/YosinskiCBL14,
  author     = {Yosinski, Jason and Clune, Jeff and Bengio, Yoshua and Lipson, Hod},
  title      = {How transferable are features in deep neural networks?},
  journal    = {CoRR},
  volume     = {abs/1411.1792},
  year       = {2014},
  eprinttype = {arXiv},
  eprint     = {1411.1792},
  url        = {http://arxiv.org/abs/1411.1792},
  timestamp  = {Mon, 13 Aug 2018 16:47:20 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/YosinskiCBL14.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@misc{liu2021swin,
  author        = {Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},
  title         = {{Swin Transformer}: Hierarchical Vision Transformer using Shifted Windows},
  year          = {2021},
  eprint        = {2103.14030},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}
@article{Wang_2022,
  author    = {Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling},
  title     = {{PVT} v2: Improved baselines with Pyramid Vision Transformer},
  journal   = {Computational Visual Media},
  publisher = {Springer Science and Business Media {LLC}},
  volume    = {8},
  number    = {3},
  pages     = {415--424},
  year      = {2022},
  month     = mar,
  doi       = {10.1007/s41095-022-0274-8},
  url       = {https://doi.org/10.1007/s41095-022-0274-8}
}
@misc{zhai2022scaling,
  author        = {Zhai, Xiaohua and Kolesnikov, Alexander and Houlsby, Neil and Beyer, Lucas},
  title         = {Scaling Vision Transformers},
  year          = {2022},
  eprint        = {2106.04560},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@misc{dosovitskiy2021image,
  author        = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil},
  title         = {An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
  year          = {2021},
  eprint        = {2010.11929},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}
@misc{vaswani2023attention,
  author        = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  title         = {Attention Is All You Need},
  year          = {2023},
  eprint        = {1706.03762},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}
@misc{zhu2018deformable,
  author        = {Zhu, Xizhou and Hu, Han and Lin, Stephen and Dai, Jifeng},
  title         = {Deformable {ConvNets} v2: More Deformable, Better Results},
  year          = {2018},
  eprint        = {1811.11168},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}
@misc{ba2016layer,
  author        = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E.},
  title         = {Layer Normalization},
  year          = {2016},
  eprint        = {1607.06450},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ML}
}
@misc{hendrycks2023gaussian,
  author        = {Hendrycks, Dan and Gimpel, Kevin},
  title         = {{Gaussian} Error Linear Units ({GELUs})},
  year          = {2023},
  eprint        = {1606.08415},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG}
}
@inproceedings{5206848,
  author    = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
  title     = {{ImageNet}: A large-scale hierarchical image database},
  booktitle = {2009 {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {248--255},
  year      = {2009},
  doi       = {10.1109/CVPR.2009.5206848}
}

@misc{touvron2021training,
  author        = {Touvron, Hugo and Cord, Matthieu and Douze, Matthijs and Massa, Francisco and Sablayrolles, Alexandre and J{\'e}gou, Herv{\'e}},
  title         = {Training data-efficient image transformers \& distillation through attention},
  year          = {2021},
  eprint        = {2012.12877},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}
@inproceedings{8100117,
  author    = {Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming},
  title     = {Aggregated Residual Transformations for Deep Neural Networks},
  booktitle = {2017 {IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  pages     = {5987--5995},
  year      = {2017},
  doi       = {10.1109/CVPR.2017.634}
}


@article{DBLP:journals/corr/abs-2005-14165,
  author     = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and
                Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and
                Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert{-}Voss, Ariel and
                Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and
                Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and
                Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and
                Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and
                Radford, Alec and Sutskever, Ilya and Amodei, Dario},
  title      = {Language Models are Few-Shot Learners},
  journal    = {CoRR},
  volume     = {abs/2005.14165},
  year       = {2020},
  eprinttype = {arXiv},
  eprint     = {2005.14165},
  url        = {https://arxiv.org/abs/2005.14165},
  timestamp  = {Thu, 25 May 2023 10:38:31 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2005-14165.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{pmlr-v97-tan19a,
  author    = {Tan, Mingxing and Le, Quoc},
  title     = {{EfficientNet}: Rethinking Model Scaling for Convolutional Neural Networks},
  booktitle = {Proceedings of the 36th International Conference on Machine Learning},
  editor    = {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
  series    = {Proceedings of Machine Learning Research},
  volume    = {97},
  pages     = {6105--6114},
  publisher = {PMLR},
  year      = {2019},
  month     = jun,
  pdf       = {http://proceedings.mlr.press/v97/tan19a/tan19a.pdf},
  url       = {https://proceedings.mlr.press/v97/tan19a.html},
  abstract  = {Convolutional Neural Networks (ConvNets) are commonly developed at a fixed resource budget, and then scaled up for better accuracy if more resources are given. In this paper, we systematically study model scaling and identify that carefully balancing network depth, width, and resolution can lead to better performance. Based on this observation, we propose a new scaling method that uniformly scales all dimensions of depth/width/resolution using a simple yet highly effective compound coefficient. We demonstrate the effectiveness of this method on MobileNets and ResNet. To go even further, we use neural architecture search to design a new baseline network and scale it up to obtain a family of models, called EfficientNets, which achieve much better accuracy and efficiency than previous ConvNets. In particular, our EfficientNet-B7 achieves state-of-the-art 84.4\% top-1 / 97.1\% top-5 accuracy on ImageNet, while being 8.4x smaller and 6.1x faster on inference than the best existing ConvNet (Huang et al., 2018). Our EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7\%), Flower (98.8\%), and 3 other transfer learning datasets, with an order of magnitude fewer parameters.}
}
@article{DBLP:journals/corr/abs-1801-04381,
  author     = {Sandler, Mark and Howard, Andrew G. and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang{-}Chieh},
  title      = {Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation},
  journal    = {CoRR},
  volume     = {abs/1801.04381},
  year       = {2018},
  eprinttype = {arXiv},
  eprint     = {1801.04381},
  url        = {http://arxiv.org/abs/1801.04381},
  timestamp  = {Tue, 12 Jan 2021 15:30:06 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1801-04381.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@misc{Gupta_Tan2019,
  author       = {Gupta, Suyog and Tan, Mingxing},
  title        = {{EfficientNet-EdgeTPU}: Creating accelerator-optimized neural networks with {AutoML}},
  howpublished = {Google Research Blog},
  year         = {2019},
  month        = aug,
  url          = {https://ai.googleblog.com/2019/08/efficientnet-edgetpu-creating.html}
}


@article{DBLP:journals/corr/abs-1911-09070,
  author     = {Tan, Mingxing and Pang, Ruoming and Le, Quoc V.},
  title      = {{EfficientDet}: Scalable and Efficient Object Detection},
  journal    = {CoRR},
  volume     = {abs/1911.09070},
  year       = {2019},
  eprinttype = {arXiv},
  eprint     = {1911.09070},
  url        = {http://arxiv.org/abs/1911.09070},
  timestamp  = {Tue, 03 Dec 2019 14:15:54 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1911-09070.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{Lin_2017_CVPR,
  author    = {Lin, Tsung-Yi and Doll{\'a}r, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge},
  title     = {Feature Pyramid Networks for Object Detection},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2017},
  month     = jul
}
@article{DBLP:journals/corr/abs-1803-01534,
  author     = {Liu, Shu and Qi, Lu and Qin, Haifang and Shi, Jianping and Jia, Jiaya},
  title      = {Path Aggregation Network for Instance Segmentation},
  journal    = {CoRR},
  volume     = {abs/1803.01534},
  year       = {2018},
  eprinttype = {arXiv},
  eprint     = {1803.01534},
  url        = {http://arxiv.org/abs/1803.01534},
  timestamp  = {Wed, 11 Sep 2019 15:40:23 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1803-01534.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1904-07392,
  author     = {Ghiasi, Golnaz and Lin, Tsung{-}Yi and Pang, Ruoming and Le, Quoc V.},
  title      = {{NAS-FPN:} Learning Scalable Feature Pyramid Architecture for Object Detection},
  journal    = {CoRR},
  volume     = {abs/1904.07392},
  year       = {2019},
  eprinttype = {arXiv},
  eprint     = {1904.07392},
  url        = {http://arxiv.org/abs/1904.07392},
  timestamp  = {Thu, 25 Apr 2019 13:55:01 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1904-07392.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}


@article{adelson1984pmi,
  author               = {Adelson, E. H. and Anderson, C. H. and Bergen, J. R. and Burt, P. J. and Ogden, J. M.},
  title                = {Pyramid methods in image processing},
  journal              = {RCA Engineer},
  volume               = 29,
  number               = 6,
  pages                = {33--41},
  year                 = 1984,
  keywords             = {deepzoom image ma10 processing pyramid},
  biburl               = {https://www.bibsonomy.org/bibtex/259dfac6a273a879eb5c33f0f5b740980/sac},
  citeulike-article-id = {1622723},
  interhash            = {1b86abb78a10e821d19471cbc87bbe0e},
  intrahash            = {59dfac6a273a879eb5c33f0f5b740980},
  priority             = {0},
  added-at             = {2011-09-19T12:12:54.000+0200},
  posted-at            = {2007-09-05 11:12:27},
  timestamp            = {2011-09-19T12:12:54.000+0200}
}


@article{DBLP:journals/corr/ZophL16,
  author     = {Zoph, Barret and Le, Quoc V.},
  title      = {Neural Architecture Search with Reinforcement Learning},
  journal    = {CoRR},
  volume     = {abs/1611.01578},
  year       = {2016},
  eprinttype = {arXiv},
  eprint     = {1611.01578},
  url        = {http://arxiv.org/abs/1611.01578},
  timestamp  = {Mon, 13 Aug 2018 16:46:24 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/ZophL16.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/ZophVSL17,
  author     = {Zoph, Barret and Vasudevan, Vijay and Shlens, Jonathon and Le, Quoc V.},
  title      = {Learning Transferable Architectures for Scalable Image Recognition},
  journal    = {CoRR},
  volume     = {abs/1707.07012},
  year       = {2017},
  eprinttype = {arXiv},
  eprint     = {1707.07012},
  url        = {http://arxiv.org/abs/1707.07012},
  timestamp  = {Mon, 13 Aug 2018 16:48:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/ZophVSL17.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/GirshickDDM13,
  author     = {Girshick, Ross B. and Donahue, Jeff and Darrell, Trevor and Malik, Jitendra},
  title      = {Rich feature hierarchies for accurate object detection and semantic segmentation},
  journal    = {CoRR},
  volume     = {abs/1311.2524},
  year       = {2013},
  eprinttype = {arXiv},
  eprint     = {1311.2524},
  url        = {http://arxiv.org/abs/1311.2524},
  timestamp  = {Mon, 13 Aug 2018 16:48:09 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/GirshickDDM13.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{Uijlings2013,
  author    = {Uijlings, J. R. R. and van de Sande, K. E. A. and Gevers, T. and Smeulders, A. W. M.},
  title     = {Selective Search for Object Recognition},
  journal   = {International Journal of Computer Vision},
  publisher = {Springer Science and Business Media {LLC}},
  volume    = {104},
  number    = {2},
  pages     = {154--171},
  year      = {2013},
  month     = apr,
  doi       = {10.1007/s11263-013-0620-5},
  url       = {https://doi.org/10.1007/s11263-013-0620-5}
}
@article{DBLP:journals/corr/Girshick15,
  author     = {Girshick, Ross B.},
  title      = {Fast {R-CNN}},
  journal    = {CoRR},
  volume     = {abs/1504.08083},
  year       = {2015},
  eprinttype = {arXiv},
  eprint     = {1504.08083},
  url        = {http://arxiv.org/abs/1504.08083},
  timestamp  = {Mon, 13 Aug 2018 16:49:11 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/Girshick15.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 1506.01497, listed under CoRR; record exported from dblp.
% The colon stays outside the brace group: BibTeX only recognises a subtitle
% colon at brace depth 0, so this keeps "Towards" capitalised under
% sentence-casing styles while still protecting the acronym.
@article{DBLP:journals/corr/RenHG015,
  author     = {Ren, Shaoqing and
                He, Kaiming and
                Girshick, Ross B. and
                Sun, Jian},
  title      = {Faster {R-CNN}: Towards Real-Time Object Detection with Region Proposal
                Networks},
  journal    = {CoRR},
  volume     = {abs/1506.01497},
  year       = {2015},
  url        = {http://arxiv.org/abs/1506.01497},
  eprinttype = {arXiv},
  eprint     = {1506.01497},
  timestamp  = {Mon, 13 Aug 2018 16:46:02 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/RenHG015.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% "Deep learning" review article, Nature 521(7553), 2015.
@article{LeCun2015,
  author    = {LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey},
  title     = {Deep learning},
  journal   = {Nature},
  publisher = {Springer Science and Business Media {LLC}},
  volume    = {521},
  number    = {7553},
  pages     = {436--444},
  month     = may,
  year      = {2015},
  doi       = {10.1038/nature14539},
  url       = {https://doi.org/10.1038/nature14539}
}

% Remote Sensing 10(2), article number 299 (MDPI export; abstract kept verbatim).
@article{rs10020299,
  author         = {Gao, Qishuo and Lim, Samsung and Jia, Xiuping},
  title          = {Hyperspectral Image Classification Using Convolutional Neural Networks and Multiple Feature Learning},
  journal        = {Remote Sensing},
  volume         = {10},
  number         = {2},
  article-number = {299},
  year           = {2018},
  issn           = {2072-4292},
  doi            = {10.3390/rs10020299},
  url            = {https://www.mdpi.com/2072-4292/10/2/299},
  abstract       = {Convolutional neural networks (CNNs) have been extended to hyperspectral imagery (HSI) classification due to its better feature representation and high performance, whereas multiple feature learning has shown its effectiveness in computer vision areas. This paper proposes a novel framework that takes advantage of both CNNs and multiple feature learning to better predict the class labels for HSI pixels. We built a novel CNN architecture with various features extracted from the raw imagery as input. The network generates the corresponding relevant feature maps for the input, and the generated feature maps are fed into a concatenating layer to form a joint feature map. The obtained joint feature map is then input to the subsequent layers to predict the final labels for each hyperspectral pixel. The proposed method not only takes advantage of enhanced feature extraction from CNNs, but also fully exploits the spectral and spatial information jointly. The effectiveness of the proposed method is tested with three benchmark data sets, and the results show that the CNN-based multi-feature learning framework improves the classification accuracy significantly.}
}
% Proceedings of the IEEE 103(9), 2015 (IEEE Xplore export).
% The accented given name uses a LaTeX escape so the entry is safe under
% 8-bit classic BibTeX, matching the escapes used elsewhere in this file.
@article{7214350,
  author  = {Lahat, Dana and Adali, T{\"u}lay and Jutten, Christian},
  title   = {Multimodal Data Fusion: An Overview of Methods, Challenges, and Prospects},
  journal = {Proceedings of the IEEE},
  year    = {2015},
  volume  = {103},
  number  = {9},
  pages   = {1449--1477},
  doi     = {10.1109/JPROC.2015.2460697}
}
% NeurIPS (NIPS) 2012 proceedings paper, volume 25.
% NOTE(review): the export left pages empty; 1097--1105 is the range commonly
% cited for this paper -- confirm against the printed proceedings.
@inproceedings{NIPS2012_c399862d,
  author    = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E.},
  title     = {ImageNet Classification with Deep Convolutional Neural Networks},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {Pereira, F. and Burges, C. J. and Bottou, L. and Weinberger, K. Q.},
  volume    = {25},
  pages     = {1097--1105},
  publisher = {Curran Associates, Inc.},
  year      = {2012},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf}
}
% Sensors 19(7), article number 1599 (MDPI export).
% The abstract uses plain apostrophes: the exported HTML entity carried a bare
% ampersand, which is a LaTeX alignment character and breaks any style or
% backend that typesets or passes through the abstract field.
@article{s19071599,
  author         = {Uddin, Md Azher and Lee, Young-Koo},
  title          = {Feature Fusion of Deep Spatial Features and Handcrafted Spatiotemporal Features for Human Action Recognition},
  journal        = {Sensors},
  volume         = {19},
  number         = {7},
  article-number = {1599},
  year           = {2019},
  issn           = {1424-8220},
  pubmedid       = {30987018},
  doi            = {10.3390/s19071599},
  url            = {https://www.mdpi.com/1424-8220/19/7/1599},
  abstract       = {Human action recognition plays a significant part in the research community due to its emerging applications. A variety of approaches have been proposed to resolve this problem, however, several issues still need to be addressed. In action recognition, effectively extracting and aggregating the spatial-temporal information plays a vital role to describe a video. In this research, we propose a novel approach to recognize human actions by considering both deep spatial features and handcrafted spatiotemporal features. Firstly, we extract the deep spatial features by employing a state-of-the-art deep convolutional network, namely Inception-Resnet-v2. Secondly, we introduce a novel handcrafted feature descriptor, namely Weber's law based Volume Local Gradient Ternary Pattern (WVLGTP), which brings out the spatiotemporal features. It also considers the shape information by using gradient operation. Furthermore, Weber's law based threshold value and the ternary pattern based on an adaptive local threshold is presented to effectively handle the noisy center pixel value. Besides, a multi-resolution approach for WVLGTP based on an averaging scheme is also presented. Afterward, both these extracted features are concatenated and feed to the Support Vector Machine to perform the classification. Lastly, the extensive experimental analysis shows that our proposed method outperforms state-of-the-art approaches in terms of accuracy.}
}
% CVPR 2017 conference paper (IEEE Xplore export).
% Empty volume/number placeholders are omitted, and the surname particle is
% lowercase so BibTeX parses "van der" as the von part of the name.
@inproceedings{8099726,
  author    = {Huang, Gao and Liu, Zhuang and van der Maaten, Laurens and Weinberger, Kilian Q.},
  title     = {Densely Connected Convolutional Networks},
  booktitle = {2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2017},
  pages     = {2261--2269},
  doi       = {10.1109/CVPR.2017.243}
}


% arXiv preprint 2206.09959 (cs.CV).
@misc{hatamizadeh2023global,
  author        = {Hatamizadeh, Ali and Yin, Hongxu and Heinrich, Greg and Kautz, Jan and Molchanov, Pavlo},
  title         = {Global Context Vision Transformers},
  year          = {2023},
  archivePrefix = {arXiv},
  eprint        = {2206.09959},
  primaryClass  = {cs.CV}
}
% arXiv preprint 2009.14082, listed under CoRR; record exported from dblp.
@article{DBLP:journals/corr/abs-2009-14082,
  author     = {Dai, Yimian and Gieseke, Fabian and Oehmcke, Stefan and Wu, Yiquan and Barnard, Kobus},
  title      = {Attentional Feature Fusion},
  journal    = {CoRR},
  volume     = {abs/2009.14082},
  year       = {2020},
  eprinttype = {arXiv},
  eprint     = {2009.14082},
  url        = {https://arxiv.org/abs/2009.14082},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2009-14082.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
  timestamp  = {Wed, 30 Sep 2020 16:16:22 +0200}
}
% arXiv preprint 1506.04579, listed under CoRR; record exported from dblp.
% The camel-case model name is brace-protected so sentence-casing styles do
% not turn it into "Parsenet".
@article{DBLP:journals/corr/LiuRB15,
  author     = {Liu, Wei and
                Rabinovich, Andrew and
                Berg, Alexander C.},
  title      = {{ParseNet}: Looking Wider to See Better},
  journal    = {CoRR},
  volume     = {abs/1506.04579},
  year       = {2015},
  url        = {http://arxiv.org/abs/1506.04579},
  eprinttype = {arXiv},
  eprint     = {1506.04579},
  timestamp  = {Mon, 13 Aug 2018 16:48:41 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/LiuRB15.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 1709.01507, listed under CoRR; record exported from dblp.
@article{DBLP:journals/corr/abs-1709-01507,
  author     = {Hu, Jie and Shen, Li and Sun, Gang},
  title      = {Squeeze-and-Excitation Networks},
  journal    = {CoRR},
  volume     = {abs/1709.01507},
  year       = {2017},
  eprinttype = {arXiv},
  eprint     = {1709.01507},
  url        = {http://arxiv.org/abs/1709.01507},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1709-01507.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
  timestamp  = {Wed, 11 Aug 2021 09:47:11 +0200}
}

% NeurIPS 2019 proceedings paper, volume 32.
% The library name is brace-protected against sentence-casing, and the editor
% surname with an elided particle is written as one token in "Last, First"
% form so it is not split into a separate von part.
% NOTE(review): the export left pages empty; 8024--8035 is the range commonly
% cited for this paper -- confirm against the printed proceedings.
@inproceedings{NEURIPS2019_bdbca288,
  author    = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},
  title     = {{PyTorch}: An Imperative Style, High-Performance Deep Learning Library},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {Wallach, H. and Larochelle, H. and Beygelzimer, A. and d'Alch{\'e}-Buc, F. and Fox, E. and Garnett, R.},
  volume    = {32},
  pages     = {8024--8035},
  publisher = {Curran Associates, Inc.},
  year      = {2019},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2019/file/bdbca288fee7f92f2bfa9f7012727740-Paper.pdf}
}

% Software citation for the pytorch-grad-cam repository hosted on GitHub.
% The library name and the acronym are brace-protected so sentence-casing
% styles do not render them as "Pytorch" and "cam".
@misc{jacobgilpytorchcam,
  author       = {Jacob Gildenblat and contributors},
  title        = {{PyTorch} library for {CAM} methods},
  year         = {2021},
  publisher    = {GitHub},
  howpublished = {\url{https://github.com/jacobgil/pytorch-grad-cam}},
}

% arXiv preprint 1405.0312, listed under CoRR; record exported from dblp.
% The colon stays outside the brace group: BibTeX only recognises a subtitle
% colon at brace depth 0, so this keeps "Common" capitalised under
% sentence-casing styles while still protecting the acronym.
@article{DBLP:journals/corr/LinMBHPRDZ14,
  author     = {Lin, Tsung{-}Yi and
                Maire, Michael and
                Belongie, Serge J. and
                Bourdev, Lubomir D. and
                Girshick, Ross B. and
                Hays, James and
                Perona, Pietro and
                Ramanan, Deva and
                Doll{\'{a}}r, Piotr and
                Zitnick, C. Lawrence},
  title      = {Microsoft {COCO}: Common Objects in Context},
  journal    = {CoRR},
  volume     = {abs/1405.0312},
  year       = {2014},
  url        = {http://arxiv.org/abs/1405.0312},
  eprinttype = {arXiv},
  eprint     = {1405.0312},
  timestamp  = {Mon, 13 Aug 2018 16:48:13 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/LinMBHPRDZ14.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint 1711.08189, listed under CoRR; record exported from dblp.
% The separator before the acronym is typed as a LaTeX en dash rather than a
% spaced hyphen.
@article{DBLP:journals/corr/abs-1711-08189,
  author     = {Singh, Bharat and
                Davis, Larry S.},
  title      = {An Analysis of Scale Invariance in Object Detection -- {SNIP}},
  journal    = {CoRR},
  volume     = {abs/1711.08189},
  year       = {2017},
  url        = {http://arxiv.org/abs/1711.08189},
  eprinttype = {arXiv},
  eprint     = {1711.08189},
  timestamp  = {Mon, 13 Aug 2018 16:47:19 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1711-08189.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% Information 11(2), article number 125 (MDPI export).
@article{info11020125,
  author         = {Buslaev, Alexander and Iglovikov, Vladimir I. and Khvedchenya, Eugene and Parinov, Alex and Druzhinin, Mikhail and Kalinin, Alexandr A.},
  title          = {Albumentations: Fast and Flexible Image Augmentations},
  journal        = {Information},
  volume         = {11},
  number         = {2},
  article-number = {125},
  year           = {2020},
  issn           = {2078-2489},
  doi            = {10.3390/info11020125},
  url            = {https://www.mdpi.com/2078-2489/11/2/125}
}
% arXiv preprint 1412.6980 (cs.LG).
@misc{kingma2017adam,
  author        = {Kingma, Diederik P. and Ba, Jimmy},
  title         = {Adam: A Method for Stochastic Optimization},
  year          = {2017},
  archivePrefix = {arXiv},
  eprint        = {1412.6980},
  primaryClass  = {cs.LG}
}

% arXiv preprint 1608.03983, listed under CoRR; record exported from dblp.
% The colon stays outside the brace group: BibTeX only recognises a subtitle
% colon at brace depth 0, so this keeps "Stochastic" capitalised under
% sentence-casing styles while still protecting the acronym.
@article{DBLP:journals/corr/LoshchilovH16a,
  author     = {Loshchilov, Ilya and
                Hutter, Frank},
  title      = {{SGDR}: Stochastic Gradient Descent with Restarts},
  journal    = {CoRR},
  volume     = {abs/1608.03983},
  year       = {2016},
  url        = {http://arxiv.org/abs/1608.03983},
  eprinttype = {arXiv},
  eprint     = {1608.03983},
  timestamp  = {Mon, 13 Aug 2018 16:48:29 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/LoshchilovH16a.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% Open Images dataset citation.
% The only venue information is a download location, so the entry is typed
% as misc and the location is given in howpublished (wrapped in \url so it can
% break across lines) instead of being typeset as a journal name. The title
% carries no trailing period; styles add their own punctuation.
@misc{OpenImages2,
  author       = {Krasin, Ivan and Duerig, Tom and Alldrin, Neil and Ferrari, Vittorio and Abu-El-Haija, Sami and Kuznetsova, Alina and Rom, Hassan and Uijlings, Jasper and Popov, Stefan and Kamali, Shahab and Malloci, Matteo and Pont-Tuset, Jordi and Veit, Andreas and Belongie, Serge and Gomes, Victor and Gupta, Abhinav and Sun, Chen and Chechik, Gal and Cai, David and Feng, Zheyun and Narayanan, Dhyanesh and Murphy, Kevin},
  title        = {{OpenImages}: A public dataset for large-scale multi-label and multi-class image classification},
  howpublished = {Dataset available from \url{https://storage.googleapis.com/openimages/web/index.html}},
  year         = {2017}
}