diff --git a/README.md b/README.md index 1db912e..0218462 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,10 @@ ## Introduction -This repository gathers the list of online publicly available bioacoustics datasets that can be used together with deep learning. You can visualise this table at the following url: https://bioacoustic-ai.github.io/bioacoustics-datasets/. This list aims at providing an overview of what data is currently accessible to train and evaluate models. We aim at keeping this list up-to-date, and you can contribute by opening a pull request to add new datasets. The procedure to add a new dataset is explained in the next section. +This repository gathers the list of online publicly available bioacoustics datasets that can be used together with deep learning. You can visualise this table at the following url: https://bioacoustic-ai.github.io/bioacoustics-datasets/. This list aims at providing an overview of what data is currently accessible to train and evaluate models. Note that this work is **in progress**, and missing information will be progressively added. We aim at keeping this list up-to-date, and you can contribute by opening a pull request to add new datasets. The procedure to add a new dataset is explained in the next section. ## Want to contribute? -That's great! You can contribute in two ways. The first way to contribute is to add more information on the already listed datasets, or add another dataset. In order to modify the existing datasets, you only need to modify its json file, located in the directory `datasets_json`. To add a new dataset, download the file called `dataset_template.json`. Then, open it and fill as many fields as possible. We recommend to fill at least the fields `authors`, `description`, `url`, `version`, and `license`. You will find a description of each of the fields below. You can also open a github issue to refer us to a new dataset if you do not wish to fill the information yourself. +That's great! 
You can contribute in two ways. The first way to contribute is to add more information on the already listed datasets, or add another dataset. To modify an existing dataset, you only need to edit its JSON file, located in the directory `datasets_json`. To add a new dataset, download the file called `dataset_template.json`. Then, open it and fill in as many fields as possible. Finally, place it in the folder `datasets_json` and open a pull request. We recommend filling in at least the fields `creators`, `description`, `url`, `version`, and `license`. You will find a description of each of the fields below. You can also open a github issue to refer us to a new dataset if you do not wish to fill in the information yourself. The second way to contribute is to make the webapp better, either by proposing improvements in github issues, or by implementing them directly. Read the README file in the `frontend` folder to run the webapp locally. diff --git a/dataset_template.json b/dataset_template.json index 31396c9..99a66c3 100644 --- a/dataset_template.json +++ b/dataset_template.json @@ -2,7 +2,6 @@ "additionalDescription": "", "annotationsType": "", "captureDevice": "", - "citeAs": "", "continent": "", "countryCode": "", "creators": "", @@ -18,14 +17,13 @@ "numAudioFiles": "", "numClasses": "", "numSpecies": "", - "paperLink": "", + "paperLink": null, "physicalSetting": "", "provider": "", "recordingPeriod": "", "recordingType": "", "sampleRate": "", "sizeInGb": "", - "species": "", "taxonomicClass": "", "totalDuration": "", "url": "", diff --git a/datasets_json/BlueFinLibrary.json b/datasets_json/BlueFinLibrary.json index 4c274d2..94457de 100644 --- a/datasets_json/BlueFinLibrary.json +++ b/datasets_json/BlueFinLibrary.json @@ -23,7 +23,7 @@ "recordingPeriod": "From 2005 to 2017", "recordingType": "Continuous", "sampleRate": "", - "sizeInGb": "> 5 ", + "sizeInGb": 5, "taxonomicClass": "Marine Mammals", "totalDuration": 1880.25, "url": 
"https://data.aad.gov.au/metadata/records/AcousticTrends_BlueFinLibrary", diff --git a/datasets_json/InsectSound1000.json b/datasets_json/InsectSound1000.json new file mode 100644 index 0000000..6ed3141 --- /dev/null +++ b/datasets_json/InsectSound1000.json @@ -0,0 +1,37 @@ +{ + "additionalDescription": "Recordings made in an anechoic box.", + "annotationsType": "Species", + "captureDevice": "Four-channel low-noise measurement microphone array", + "continent": "Europe", + "countryCode": "DE", + "creators": [ + "Jelto Branding", + "Dieter von Hörsten", + "Elias Böckmann", + "Jens Karl Wegener", + "Eberhard Hartung" + ], + "datePublished": "2024-04-23", + "description": "InsectSound1000 is a dataset comprising more than 169000 labelled sound samples of 12 insect species. The insect sound level spans from very loud (Bombus terrestris) to inaudible to human ears (Aphidoletes aphidimyza). The samples were extracted from more than 1000 h of recordings made in an anechoic box with a four-channel low-noise measurement microphone array. 
Each sample is a four-channel wave-file with a duration of 2500 ms, at 16 kHz sample rate and 32 bit resolution.", + "labellingLevel": "Weak", + "license": "CC BY 4.0", + "lifeStage": "", + "locality": "Julius Kühn-Institute", + "minAndMaxRecordingDuration": "2.5 - 2.5", + "name": "InsectSound1000", + "numAnnotations": 165982, + "numAudioFiles": 165982, + "numClasses": 12, + "numSpecies": 12, + "paperLink": "https://doi.org/10.1038/s41597-024-03301-4", + "physicalSetting": "Artificial", + "provider": "Julius Kühn-Institute", + "recordingPeriod": "72 recording nights", + "recordingType": "Clips", + "sampleRate": 16, + "sizeInGb": 95.1, + "taxonomicClass": "Insecta", + "totalDuration": 115, + "url": "https://doi.org/10.5073/20231024-173119-0", + "version": "" +} \ No newline at end of file diff --git a/datasets_json/MT (Meerkats).json b/datasets_json/MT (Meerkats).json index c4f2d12..dcf6abb 100644 --- a/datasets_json/MT (Meerkats).json +++ b/datasets_json/MT (Meerkats).json @@ -1,5 +1,5 @@ { - "additionalDescription": "", + "additionalDescription": "Is a subset of the MeerKAT dataset.", "annotationsType": "Vocalisation type", "captureDevice": "Collars (TS Market, Edic Mini Tiny+ A77)", "continent": "Africa", diff --git a/datasets_json/MeerKAT.json b/datasets_json/MeerKAT.json new file mode 100644 index 0000000..7753a17 --- /dev/null +++ b/datasets_json/MeerKAT.json @@ -0,0 +1,44 @@ +{ + "additionalDescription": "184 hours are labelled.", + "annotationsType": "time-bounded vox type, focality", + "captureDevice": "acoustic collars (Edic Mini Tiny+ A77), and Marantz PMD661 digital recorders (Carlsbad, CA, U.S.) 
attached to directional Sennheiser ME66 microphones", + "continent": "Africa", + "countryCode": "ZA", + "creators": [ + "Julian Schaefer-Zimmermann", + "Vlad Demartsev", + "Baptiste Averly", + "Kiran Dhanjal-Adams", + "Mathieu Duteil", + "Gabriella Gall", + "Marius Faiß", + "Lily Johnson-Ulrich", + "Dan Stowell", + "Marta Manser", + "Marie Roch", + "Ariana Strandburg-Peshkin" + ], + "datePublished": "2024-07-02", + "description": "MeerKAT is a 1068h large-scale dataset containing data from boom-mics and audio-recording collars worn by free-ranging meerkats (Suricata suricatta) at the Kalahari Research Centre, South Africa, of which 184h are labeled with twelve time-resolved vocalization-type ground truth target classes, each with millisecond resolution. The labeled 184h MeerKAT subset exhibits realistic sparsity conditions for a bioacoustic dataset (96% background-noise or other signals and 4% vocalizations), dispersed across 66398 10-second samples, spanning 251562 labeled events and showcasing significant spectral and temporal variability, making it a large-scale reference point with real-world conditions for benchmarking pretraining and finetuning approaches in bioacoustics deep learning.", + "labellingLevel": "Strong", + "license": "CC BY-NC 4.0", + "lifeStage": "Adults", + "locality": "Kalahari Research Centre", + "minAndMaxRecordingDuration": "10 - 10", + "name": "MeerKAT", + "numAnnotations": 251562, + "numAudioFiles": 384592, + "numClasses": 12, + "numSpecies": 1, + "paperLink": "https://arxiv.org/abs/2406.01253", + "physicalSetting": "Natural", + "provider": "Kalahari Research Centre", + "recordingPeriod": "Aug-Sep 2017 and Jul-Aug 2019", + "recordingType": "Clips", + "sampleRate": 8, + "sizeInGb": 61, + "taxonomicClass": "Mammalia", + "totalDuration": 1068, + "url": "https://doi.org/10.17617/3.0J0DYB", + "version": 1.2 +} \ No newline at end of file