# datasets.yaml

#
# Copyright 2020 IBM Corp. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Schema API identifier for this file (presumably checked by the consumer --
# TODO confirm).
api_name: com.ibm.pardata.v1
# Left unquoted, so YAML loaders resolve it as a date rather than a string.
last_updated: 2021-11-30
# Catalogue layout: dataset id -> version string -> metadata and subdatasets.
datasets:
  # Airline on-time performance sample covering LAX -> JFK flights.
  # NOTE(review): the version key below is 1.0.0 while download_url points at
  # a 1.0.1 path -- confirm which version is intended.
  airline:
    '1.0.0':
      name: Airline Reporting Carrier On-Time Performance datasets
      published: 2020-06-25
      homepage: https://developer.ibm.com/exchanges/data/all/airline/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-airline/1.0.1/lax_to_jfk.tar.gz
      sha512sum: 030d2e2d74a4dca03f5f471563226f68b9994ffb1cff2aaf9a50d39695f8c49ea477e87cf4981a09c7fdba8e1967b119e6f6bf52113d9cc1c09856c5e3cef064
      license: CDLA-Sharing-1.0
      estimated_size: 58K
      description: >-
        Information about flights from LAX to JFK that were reported to the
        US Bureau of Transportation Statistics.
      subdatasets:
        lax_to_jfk:
          name: LAX to JFK
          description: LAX to JFK flights sample dataset.
          format:
            id: table/csv
            options:
              # Column name -> declared column type.
              columns:
                FlightDate: datetime
          path: lax_to_jfk/lax_to_jfk.csv
  # Wikipedia sentences with their topics, split into query result sets
  # (train / heldout / test) plus a labelled prediction set.
  claim_sentences_search:
    '1.0.2':
      name: IBM Debater® Claim Sentences Search
      published: 2019-08-19
      homepage: https://developer.ibm.com/exchanges/data/all/claim-sentences-search/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-claim-sentences-search/1.0.2/claim-sentences-search.tar.gz
      sha512sum: ca075c45338ab261d11d60bcd11bd52041f68078a3be617171caaf6291c24a72535500706be7be3b4b1dea5d93f7e1d739e4393de0fdeba7eacd019e9f70171b
      license: CC-BY-SA-3.0
      estimated_size: 571M
      description: Sentences from Wikipedia together with their topic.
      subdatasets:
        # The three q_mc query subdatasets declare the same seven columns.
        train:
          name: MC Query Train
          description: 'Sentences retrieved by the q_mc query on 70 train topics'
          format:
            id: table/csv
            options:
              columns:
                id: string
                topic: string
                mc: string
                sentence: string
                suffix: string
                prefix: string
                url: string
          path: q_mc_train.csv
        heldout:
          name: MC Query Heldout
          description: 'Sentences retrieved by the q_mc query on 30 heldout topics'
          format:
            id: table/csv
            options:
              columns:
                id: string
                topic: string
                mc: string
                sentence: string
                suffix: string
                prefix: string
                url: string
          path: q_mc_heldout.csv
        test:
          name: MC Query Test
          description: 'Sentences retrieved by the q_mc query on 50 test topics'
          format:
            id: table/csv
            options:
              columns:
                id: string
                topic: string
                mc: string
                sentence: string
                suffix: string
                prefix: string
                url: string
          path: q_mc_test.csv
        # Unlike the query subdatasets, this one declares query_pattern,
        # score and label columns instead of suffix and prefix.
        test_set:
          name: DNN Test
          description: 'Top predictions of our system along with their labels'
          format:
            id: table/csv
            options:
              columns:
                id: string
                topic: string
                mc: string
                sentence: string
                query_pattern: string
                score: float
                label: boolean
                url: string
          path: test_set.csv
  # Wikipedia concepts rated for abstractness, one subdataset per n-gram size.
  concept_abstractness:
    "1.0.2":
      name: IBM Debater® Concept Abstractness
      published: 2019-07-29
      homepage: https://developer.ibm.com/exchanges/data/all/concept-abstractness/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-concept-abstractness/1.0.2/concept-abstractness.tar.gz
      sha512sum: 25cb76c0a8fdfc9cae7e050d4c2492bf055f97a20fa85690b9aaf7dcf965a705fc89e32aed7fbb6d418432e5368cb06fc3bb0f1ab85807fec8aef9df3965cc06
      license: CC-BY-SA-3.0
      estimated_size: 3.6M
      description: "A set of concepts from Wikipedia rated for their degree of abstractness."
      subdatasets:
        # `format` is a mapping holding the format id together with its
        # options, the same layout every other tabular subdataset in this
        # file uses.
        unigrams:
          name: Unigrams
          description: "Concepts and abstractness scores for unigrams (single worded concepts)"
          format:
            id: table/csv
            options:
              columns:
                Concept: 'string'
                Score: 'float'
          path: prediction_unigrams.csv
        bigrams:
          name: Bigrams
          description: "Concepts and abstractness scores for bigrams (two word concepts)"
          format:
            id: table/csv
            options:
              columns:
                Concept: 'string'
                Score: 'float'
          path: prediction_bigrams.csv
        trigrams:
          name: Trigrams
          description: "Concepts and abstractness scores for trigrams (three word concepts)"
          format:
            id: table/csv
            options:
              columns:
                Concept: 'string'
                Score: 'float'
          path: prediction_trigrams.csv
  # Categorized public questions from the COVID-19 pandemic period.
  covid19_questions:
    '1.0.0':
      name: COVID-19 Questions
      published: 2020-10-01
      homepage: https://developer.ibm.com/exchanges/data/all/cqa/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-covid-19-questions/1.0.0/covid-19-questions.tar.gz
      sha512sum: f0cd0689b51277e14361dcd110ad6268fd88c8bdd9d6254f15953e539d0f847bbfd4981d1cc5cf0045fb1c159938e30d0b4e7ab68c20ccec5ee7db6ff3622b41
      license: CDLA-Sharing-1.0
      estimated_size: 49K
      description: >-
        Categorized questions which were frequently asked by the public
        during the COVID-19 pandemic period.
      subdatasets:
        full:
          name: COVID-19 Questions
          description: 'Full dataset.'
          format:
            id: table/csv
            options:
              columns:
                label: string
                utterance: string
              # Double quotes are required here so that \t is read as a tab
              # character (the file is a .tsv).
              delimiter: "\t"
          path: covid-19-questions/covid_19_questions.tsv
  # Candidate monomers labelled for workability (single CSV).
  expert_in_the_loop_ai_polymer_discovery:
    '1.0.0':
      name: Expert in the Loop AI – Polymer Discovery
      published: 2020-06-15
      homepage: https://developer.ibm.com/exchanges/data/all/expert-in-the-loop-ai-polymer-discovery/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-expert-in-the-loop-ai-polymer-discovery/1.0.0/expert-in-the-loop-ai-polymer-discovery.tar.gz
      sha512sum: a87db5c8a30288fd2dc41c1189125378c0700035f7c3a2ca6c71bfbc4b955c15f2bb4f955d2b4a3e95b06ee3719fbd1082c79b299cf688da7af6baedb333de4c
      license: CDLA-Sharing-1.0
      estimated_size: 14M
      description: >-
        Candidate monomers for ring-opening polymerization
        adjudicated/labelled for workability.
      subdatasets:
        full:
          name: Expert in the Loop AI Full Dataset
          description: 'Full version of the Expert in the Loop AI dataset.'
          format:
            id: table/csv
            options:
              columns:
                Sequence: string
                Adjudication: string
          path: expert-in-the-loop-ai-polymer-discovery/EITLAI_CyclicLactones_ROP_v1.csv
  # Fashion-MNIST train/test CSVs; no per-column options are declared.
  # NOTE(review): the version key below is 1.0.0 while download_url points at
  # a 1.0.2 path -- confirm which version is intended.
  fashion_mnist:
    '1.0.0':
      name: Fashion-MNIST
      published: 2019-08-22
      homepage: https://developer.ibm.com/exchanges/data/all/fashion-mnist/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-fashion-mnist/1.0.2/fashion-mnist.tar.gz
      sha512sum: dd89a49011582fd9c6825a74a8af0a4d72cdb8a8a3bc9e300c7f0bf92cf83a251af58ef24cf66ec5047c84c17f1d607bb3c648a6dbb89e1d4935a654ea6befc6
      license: MIT
      estimated_size: 37M
      description: >-
        The Fashion-MNIST dataset contains 60,000 training images (and 10,000
        test images) of fashion and clothing items, taken from 10 classes. Each
        image is a standardized 28×28 size in grayscale (784 total pixels).
        Fashion-MNIST was created by Zalando as a compatible replacement for
        the original MNIST dataset of handwritten digits.
      subdatasets:
        train:
          name: Training dataset
          description: 'Training dataset in Fashion-MNIST.'
          format:
            id: table/csv
          path: fashion-mnist_train.csv
        test:
          name: Testing dataset
          description: 'Testing dataset in Fashion-MNIST.'
          format:
            id: table/csv
          path: fashion-mnist_test.csv
  # Groningen Meaning Bank (modified): annotated multi-sentence texts shipped
  # as a single plain-text file.
  gmb:
    "1.0.2":
      name: Groningen Meaning Bank Modified
      published: 2019-12-19
      homepage: https://developer.ibm.com/exchanges/data/all/groningen-meaning-bank/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-groningen-meaning-bank-modified/1.0.2/groningen-meaning-bank-modified.tar.gz
      sha512sum: 4b0e6c445bf5be0573ae411f8e0ba307b884300ab6b5473ea0d455dd82b8cf4dc06fb77a9a606850f3b283357f22fd516e91850cea7e45de19ce5625fda2c001
      license: CDLA-Sharing-1.0
      estimated_size: 10M
      description: "A dataset of multi-sentence texts, together with annotations for parts-of-speech, named entities, lexical categories and other natural language structural phenomena."
      subdatasets:
        gmb_subset_full:
          name: GMB Subset Full
          description: A full version of the raw dataset. Used to train MAX model – Named Entity Tagger.
          # `format` is a mapping with an `id` key, matching the layout used
          # by the other option-less formats in this file.
          format:
            id: text/plain
          path: groningen_meaning_bank_modified/gmb_subset_full.txt
  # Hourly NOAA weather observations recorded at JFK airport (cleaned CSV).
  noaa_jfk:
    '1.1.4':
      name: NOAA Weather Data – JFK Airport
      published: 2019-09-12
      homepage: https://developer.ibm.com/exchanges/data/all/jfk-weather-data/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-data-jfk-airport.tar.gz
      sha512sum: e3f27a8fcc0db5289df356e3f48aef6df56236798d5b3ae3889d358489ec6609d2d797e4c4932b86016d2ce4a379ac0a0749b6fb2c293ebae4e585ea1c8422ac
      license: CDLA-Sharing-1.0
      estimated_size: 3.2M
      description: >-
        The NOAA JFK dataset contains 114,546 hourly observations of various
        local climatological variables (including visibility, temperature,
        wind speed and direction, humidity, dew point, and pressure). The data
        was collected by a NOAA weather station located at the John F. Kennedy
        International Airport in Queens, New York.
      subdatasets:
        jfk_weather_cleaned:
          name: Cleaned JFK Weather Data
          description: 'Cleaned version of the JFK weather data.'
          format:
            id: table/csv
            options:
              # Only the DATE column has a declared type.
              columns:
                DATE: datetime
          path: noaa-weather-data-jfk-airport/jfk_weather_cleaned.csv
  # Curated well logs plus well coordinates for the Taranaki Basin.
  taranaki_basin_curated_well_logs:
    "1.0.0":
      name: Taranaki Basin Curated Well Logs
      published: 2020-05-10
      homepage: https://developer.ibm.com/exchanges/data/all/taranaki-basin-curated-well-logs/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-taranaki-basin-curated-well-logs/1.0.0/taranaki-basin-curated-well-logs.tar.gz
      sha512sum: 6f1a8e051806b934e650dca7f69c58505635303ea3a47bfa36da175056669e5a09057fac8895d4f01db74a547c63ce6c6b8959854f10d6265cda71c377d06183
      license: CDLA-Sharing-1.0
      estimated_size: 873M
      description: "A dataset containing well logs curated from the 2016 New Zealand Petroleum Exploration public data pack."
      subdatasets:
        logs:
          name: Taranaki Basin Curated Well Logs
          description: Log information
          format:
            id: table/csv
            options:
              columns:
                BS: 'float'
                CALI: 'float'
                DENS: 'float'
                DRHO: 'float'
                DTC: 'float'
                GR: 'float'
                NEUT: 'float'
                PEF: 'float'
                RESD: 'float'
                RESM: 'float'
                RESS: 'float'
                SP: 'float'
                TEMP: 'float'
                TENS: 'float'
                X: 'float'
                # Quoted on purpose: a bare Y is a boolean under the YAML 1.1
                # type rules, which would turn this column name into `true`.
                'Y': 'float'
                Z: 'float'
                DEPT: 'float'
                ONSHORE: 'boolean'
                DIRSURVEY: 'boolean'
                WELLNAME: 'string'
                FORMATION: 'string'
          path: taranaki-basin-curated-well-logs/logs.csv
        coordinates:
          name: Taranaki Basin Curated Well Coordinates
          description: Coordinates of the wells
          format:
            id: table/csv
            options:
              columns:
                WELL_NAME: 'string'
                LAT: 'float'
                LONG: 'float'
                X: 'float'
                # Quoted on purpose: a bare Y is a boolean under the YAML 1.1
                # type rules.
                'Y': 'float'
          path: taranaki-basin-curated-well-logs/coords.csv
  # One-second spoken-word .wav clips; each subdataset below selects the audio
  # files of one directory through a regex path.
  # NOTE(review): the version key below is 1.0.0 while download_url points at
  # a 1.0.1 path -- confirm which version is intended.
  tensorflow_speech_commands:
    "1.0.0":
      name: TensorFlow Speech Commands
      published: 2020-03-17
      homepage: https://developer.ibm.com/exchanges/data/all/speech-commands/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-tensorflow-speech-commands/1.0.1/tensorflow-speech-commands.tar.gz
      sha512sum: 1c4b41d9f719454f4155b4540e8aec0b1c5bfd555974f4e9e2253dd4e52b395ecc239f26a16a86ef047d1aac78a9ac1fe41f9c559a1d3478d951e4f078bb0fb8
      license: CC-BY-4.0
      estimated_size: 1.49G
      description: "TensorFlow Speech Command dataset is a set of one-second .wav audio files, each containing a single spoken English word. These words are from a small set of commands, and are spoken by a variety of different speakers. 20 of the words are core words, while 10 words are auxiliary words that could act as tests for algorithms in ignoring speeches that do not contain triggers. Included along with the 30 words is a collection of background noise audio files. The dataset was originally designed for limited vocabulary speech recognition tasks. The audio clips were originally collected by Google, and recorded by volunteers in uncontrolled locations around the world."
      # Subdataset keys, names and descriptions are kept quoted: words such as
      # "yes", "no", "on" and "off" would otherwise be read as booleans by
      # YAML 1.1 parsers. Each path is a mapping of `type: regex` plus the
      # pattern in `value`; the doubled backslash is a double-quoted escape
      # for one literal backslash, so the regex itself contains `\.wav`.
      subdatasets:
        "one":
          name: "one"
          description: "one"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/one/.*\\.wav"
        "yes":
          name: "yes"
          description: "yes"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/yes/.*\\.wav"
        "five":
          name: "five"
          description: "five"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/five/.*\\.wav"
        "zero":
          name: "zero"
          description: "zero"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/zero/.*\\.wav"
        "happy":
          name: "happy"
          description: "happy"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/happy/.*\\.wav"
        "down":
          name: "down"
          description: "down"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/down/.*\\.wav"
        "four":
          name: "four"
          description: "four"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/four/.*\\.wav"
        "dog":
          name: "dog"
          description: "dog"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/dog/.*\\.wav"
        "stop":
          name: "stop"
          description: "stop"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/stop/.*\\.wav"
        "wow":
          name: "wow"
          description: "wow"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/wow/.*\\.wav"
        "no":
          name: "no"
          description: "no"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/no/.*\\.wav"
        "house":
          name: "house"
          description: "house"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/house/.*\\.wav"
        "three":
          name: "three"
          description: "three"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/three/.*\\.wav"
        "bird":
          name: "bird"
          description: "bird"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/bird/.*\\.wav"
        "eight":
          name: "eight"
          description: "eight"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/eight/.*\\.wav"
        "two":
          name: "two"
          description: "two"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/two/.*\\.wav"
        "cat":
          name: "cat"
          description: "cat"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/cat/.*\\.wav"
        "off":
          name: "off"
          description: "off"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/off/.*\\.wav"
        "sheila":
          name: "sheila"
          description: "sheila"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/sheila/.*\\.wav"
        "six":
          name: "six"
          description: "six"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/six/.*\\.wav"
        "tree":
          name: "tree"
          description: "tree"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/tree/.*\\.wav"
        "nine":
          name: "nine"
          description: "nine"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/nine/.*\\.wav"
        "up":
          name: "up"
          description: "up"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/up/.*\\.wav"
        "marvin":
          name: "marvin"
          description: "marvin"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/marvin/.*\\.wav"
        "seven":
          name: "seven"
          description: "seven"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/seven/.*\\.wav"
        "right":
          name: "right"
          description: "right"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/right/.*\\.wav"
        "on":
          name: "on"
          description: "on"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/on/.*\\.wav"
        "go":
          name: "go"
          description: "go"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/go/.*\\.wav"
        "_background_noise_":
          name: "_background_noise_"
          description: "_background_noise_"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/_background_noise_/.*\\.wav"
        "left":
          name: "left"
          description: "left"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/left/.*\\.wav"
        # NOTE(review): this entry repeats the top-level directory name inside
        # the path -- confirm the archive really contains such a nested
        # directory with .wav files.
        "TensorFlow-Speech-Commands":
          name: "TensorFlow-Speech-Commands"
          description: "TensorFlow-Speech-Commands"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/TensorFlow-Speech-Commands/.*\\.wav"
        "bed":
          name: "bed"
          description: "bed"
          format:
            id: audio/wav
          path:
            type: regex
            value: "TensorFlow-Speech-Commands/bed/.*\\.wav"

  # Sentence-clustering benchmark built from Wikipedia article sections.
  thematic_clustering_of_sentences:
    '1.0.2':
      name: IBM Debater® Thematic Clustering of Sentences
      published: 2019-08-01
      homepage: https://developer.ibm.com/exchanges/data/all/thematic-clustering-of-sentences/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-thematic-clustering-of-sentences/1.0.2/thematic-clustering-of-sentences.tar.gz
      sha512sum: 08a3f1a9dc06083eb51874e90d7241f67b676af2cbc28fe6a312694051f53391fc95de70fdcdce404de3578fa389558220ea38d34f70265ed88220d0b14f1aba
      license: CC-BY-SA-3.0
      estimated_size: 10.6M
      description: >-
        A benchmark of sentence-clustering based on the partition of
        Wikipedia articles into sections.
      subdatasets:
        full:
          name: IBM Debater® Thematic Clustering of Sentences
          description: 'IBM Debater® Thematic Clustering of Sentences complete dataset'
          format:
            id: table/csv
            options:
              columns:
                article_title: string
                sentence: string
                cluster_title: string
                article_link: string
          path: dataset.csv
  # Wikipedia categories and lists annotated with a Pro/Con stance.
  wikipedia_category_stance:
    '1.0.2':
      name: IBM Debater® Wikipedia Category Stance
      published: 2019-08-01
      homepage: https://developer.ibm.com/exchanges/data/all/wikipedia-category-stance/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-wikipedia-category-stance/1.0.2/wikipedia-category-stance.tar.gz
      sha512sum: a8e056737699a5a52ef5b7bd91f13f9a672e091b7636010cdcc0ba9d34c322dd761c50bdda01fe988ed6650470f9410ef9f8789e54c4d90124606f7a889f57b4
      license: CC-BY-3.0
      estimated_size: 525K
      description: >-
        Wikipedia categories and lists annotated for stance (Pro/Con)
        towards the concepts
      subdatasets:
        full:
          name: IBM Debater® Wikipedia Category Stance Dataset
          description: 'IBM Debater® Wikipedia Category Stance complete dataset'
          format:
            id: table/csv
            options:
              columns:
                Label: string
                Concept: string
                Category: string
                URL: string
          path: WikipediaCategoriesResults.csv
  # Pairs of Wikipedia concepts scored for relatedness (single CSV).
  wikipedia_oriented_relatedness:
    "1.0.2":
      name: IBM Debater® Wikipedia Oriented Relatedness
      published: 2019-08-01
      homepage: https://developer.ibm.com/exchanges/data/all/wikipedia-oriented-relatedness/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-wikipedia-oriented-relatedness/1.0.2/wikipedia-oriented-relatedness.tar.gz
      sha512sum: 270f0bb4711acff8f88e9ce10403a72ce13b8ad21205f61d41c17c520b1d30bccefcd2312db92c80ec525a8e8a926b2ff77f58d1e95efb0b17b945c8f5de4e7f
      license: CC-BY-SA-3.0
      estimated_size: 3.4M
      description: "Pairs of concepts from Wikipedia scored for their level of relatedness."
      subdatasets:
        full:
          name: IBM Debater® Wikipedia Oriented Relatedness complete dataset
          # Every other subdataset in this file carries a description; this
          # one was missing it.
          description: IBM Debater® Wikipedia Oriented Relatedness complete dataset
          format:
            id: table/csv
            options:
              # This is the only subdataset in the file that declares an
              # explicit encoding.
              encoding: 'ISO-8859-1'
              columns:
                source_article_uri: 'string'
                concept_1: 'string'
                concept_2: 'string'
                score: 'float'
                concept_1_uri: 'string'
                concept_2_uri: 'string'
                train_test: 'string'
          path: WORD.csv
  # WikiText-103 token files, split into train / validation / test.
  wikitext103:
    "1.0.1":
      name: WikiText-103
      published: 2020-03-17
      homepage: https://developer.ibm.com/exchanges/data/all/wikitext-103/
      download_url: https://dax-cdn.cdn.appdomain.cloud/dax-wikitext-103/1.0.1/wikitext-103.tar.gz
      sha512sum: c8186919aa1840af6b734ea41abc580574ea8efe2fafda220f5d01002464d17566d84be5199b875136c9593f0e0678fb5d7c84bb2231de8b4151cb9c83fa2109
      license: CC-BY-3.0
      estimated_size: 181M
      description: "The WikiText-103 dataset is a collection of over 100 million tokens extracted from the set of verified ‘Good’ and ‘Featured’ articles on Wikipedia."
      subdatasets:
        # `format` is a mapping with an `id` key, matching the layout used by
        # the other option-less formats in this file.
        train:
          name: Train Tokens
          description: Tokens in the training subset
          format:
            id: text/plain
          path: wikitext-103/wiki.train.tokens
        valid:
          name: Validation Tokens
          description: Tokens in the validation subset
          format:
            id: text/plain
          path: wikitext-103/wiki.valid.tokens
        test:
          name: Test Tokens
          description: Tokens in the testing subset
          format:
            id: text/plain
          path: wikitext-103/wiki.test.tokens