-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdatasets.yaml
671 lines (669 loc) · 25.3 KB
/
datasets.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
#
# Copyright 2020 IBM Corp. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Manifest header.
# `api_name` identifies the schema/API this manifest is written against
# (presumably checked by the consuming loader — confirm).
api_name: com.ibm.pardata.v1
# Unquoted ISO date: YAML 1.1 loaders resolve this to a date value, not a string.
last_updated: 2021-11-30
# Dataset registry: each dataset id maps quoted version strings to that
# version's metadata and its `subdatasets`.
datasets:
# Airline on-time performance sample: flights from LAX to JFK.
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
# NOTE(review): the version key says 1.0.0 but download_url points at
# .../1.0.1/ — confirm the archive and sha512sum match the advertised version.
airline:
  '1.0.0':
    name: Airline Reporting Carrier On-Time Performance datasets
    published: 2020-06-25
    homepage: https://developer.ibm.com/exchanges/data/all/airline/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-airline/1.0.1/lax_to_jfk.tar.gz
    sha512sum: 030d2e2d74a4dca03f5f471563226f68b9994ffb1cff2aaf9a50d39695f8c49ea477e87cf4981a09c7fdba8e1967b119e6f6bf52113d9cc1c09856c5e3cef064
    license: CDLA-Sharing-1.0
    estimated_size: 58K
    description: 'Information about flights from LAX to JFK that were reported to the US Bureau of Transportation Statistics.'
    subdatasets:
      lax_to_jfk:
        name: LAX to JFK
        description: 'LAX to JFK flights sample dataset.'
        format:
          id: table/csv
          options:
            # Declared type for the one column that is not left to inference.
            columns:
              FlightDate: datetime
        path: lax_to_jfk/lax_to_jfk.csv
# IBM Debater Claim Sentences Search: four CSV subdatasets. The three q_mc
# splits (train / heldout / test) declare the same seven string columns; the
# DNN `test_set` file has its own schema with a score and a boolean label.
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
claim_sentences_search:
  '1.0.2':
    name: IBM Debater® Claim Sentences Search
    published: 2019-08-19
    homepage: https://developer.ibm.com/exchanges/data/all/claim-sentences-search/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-claim-sentences-search/1.0.2/claim-sentences-search.tar.gz
    sha512sum: ca075c45338ab261d11d60bcd11bd52041f68078a3be617171caaf6291c24a72535500706be7be3b4b1dea5d93f7e1d739e4393de0fdeba7eacd019e9f70171b
    license: CC-BY-SA-3.0
    estimated_size: 571M
    description: 'Sentences from Wikipedia together with their topic.'
    subdatasets:
      train:
        name: MC Query Train
        description: Sentences retrieved by the q_mc query on 70 train topics
        format:
          id: table/csv
          options:
            columns:
              id: string
              topic: string
              mc: string
              sentence: string
              suffix: string
              prefix: string
              url: string
        path: q_mc_train.csv
      heldout:
        name: MC Query Heldout
        description: Sentences retrieved by the q_mc query on 30 heldout topics
        format:
          id: table/csv
          options:
            columns:
              id: string
              topic: string
              mc: string
              sentence: string
              suffix: string
              prefix: string
              url: string
        path: q_mc_heldout.csv
      test:
        name: MC Query Test
        description: Sentences retrieved by the q_mc query on 50 test topics
        format:
          id: table/csv
          options:
            columns:
              id: string
              topic: string
              mc: string
              sentence: string
              suffix: string
              prefix: string
              url: string
        path: q_mc_test.csv
      test_set:
        name: DNN Test
        description: Top predictions of our system along with their labels
        format:
          id: table/csv
          options:
            columns:
              id: string
              topic: string
              mc: string
              sentence: string
              query_pattern: string
              score: float
              # `boolean` here is a type *name* (a string), not a YAML bool.
              label: boolean
              url: string
        path: test_set.csv
# IBM Debater Concept Abstractness: Wikipedia concepts scored for
# abstractness, split by n-gram length into three CSV files.
#
# Fix: each subdataset previously wrote `format: table/csv` as a bare scalar
# followed by a separate `options:` key, so the column types were not part of
# the format description. Every other CSV entry in this manifest uses the
# mapping form (`format: {id, options}`); the three subdatasets now do too.
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
concept_abstractness:
  "1.0.2":
    name: IBM Debater® Concept Abstractness
    published: 2019-07-29
    homepage: https://developer.ibm.com/exchanges/data/all/concept-abstractness/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-concept-abstractness/1.0.2/concept-abstractness.tar.gz
    sha512sum: 25cb76c0a8fdfc9cae7e050d4c2492bf055f97a20fa85690b9aaf7dcf965a705fc89e32aed7fbb6d418432e5368cb06fc3bb0f1ab85807fec8aef9df3965cc06
    license: CC-BY-SA-3.0
    estimated_size: 3.6M
    description: "A set of concepts from Wikipedia rated for their degree of abstractness."
    subdatasets:
      unigrams:
        name: Unigrams
        description: "Concepts and abstractness scores for unigrams (single worded concepts)"
        format:
          id: table/csv
          options:
            columns:
              Concept: 'string'
              Score: 'float'
        path: prediction_unigrams.csv
      bigrams:
        name: Bigrams
        description: "Concepts and abstractness scores for bigrams (two word concepts)"
        format:
          id: table/csv
          options:
            columns:
              Concept: 'string'
              Score: 'float'
        path: prediction_bigrams.csv
      trigrams:
        name: Trigrams
        description: "Concepts and abstractness scores for trigrams (three word concepts)"
        format:
          id: table/csv
          options:
            columns:
              Concept: 'string'
              Score: 'float'
        path: prediction_trigrams.csv
# COVID-19 Questions: one tab-separated file of labelled utterances.
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
covid19_questions:
  '1.0.0':
    name: COVID-19 Questions
    published: 2020-10-01
    homepage: https://developer.ibm.com/exchanges/data/all/cqa/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-covid-19-questions/1.0.0/covid-19-questions.tar.gz
    sha512sum: f0cd0689b51277e14361dcd110ad6268fd88c8bdd9d6254f15953e539d0f847bbfd4981d1cc5cf0045fb1c159938e30d0b4e7ab68c20ccec5ee7db6ff3622b41
    license: CDLA-Sharing-1.0
    estimated_size: 49K
    description: 'Categorized questions which were frequently asked by the public during the COVID-19 pandemic period.'
    subdatasets:
      full:
        name: COVID-19 Questions
        description: 'Full dataset.'
        format:
          id: table/csv
          options:
            columns:
              label: string
              utterance: string
            # Must stay double-quoted: the \t escape (a real tab character)
            # is only processed in double-quoted scalars.
            delimiter: "\t"
        path: covid-19-questions/covid_19_questions.tsv
# Expert in the Loop AI – Polymer Discovery: a single CSV with a `Sequence`
# column and its `Adjudication` label.
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
expert_in_the_loop_ai_polymer_discovery:
  '1.0.0':
    name: Expert in the Loop AI – Polymer Discovery
    published: 2020-06-15
    homepage: https://developer.ibm.com/exchanges/data/all/expert-in-the-loop-ai-polymer-discovery/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-expert-in-the-loop-ai-polymer-discovery/1.0.0/expert-in-the-loop-ai-polymer-discovery.tar.gz
    sha512sum: a87db5c8a30288fd2dc41c1189125378c0700035f7c3a2ca6c71bfbc4b955c15f2bb4f955d2b4a3e95b06ee3719fbd1082c79b299cf688da7af6baedb333de4c
    license: CDLA-Sharing-1.0
    estimated_size: 14M
    description: 'Candidate monomers for ring-opening polymerization adjudicated/labelled for workability.'
    subdatasets:
      full:
        name: Expert in the Loop AI Full Dataset
        description: Full version of the Expert in the Loop AI dataset.
        format:
          id: table/csv
          options:
            columns:
              Sequence: string
              Adjudication: string
        path: expert-in-the-loop-ai-polymer-discovery/EITLAI_CyclicLactones_ROP_v1.csv
# Fashion-MNIST: train/test CSV files; no per-column options are declared.
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
# NOTE(review): the version key says 1.0.0 but download_url points at
# .../1.0.2/ — confirm the archive and sha512sum match the advertised version.
fashion_mnist:
  '1.0.0':
    name: Fashion-MNIST
    published: 2019-08-22
    homepage: https://developer.ibm.com/exchanges/data/all/fashion-mnist/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-fashion-mnist/1.0.2/fashion-mnist.tar.gz
    sha512sum: dd89a49011582fd9c6825a74a8af0a4d72cdb8a8a3bc9e300c7f0bf92cf83a251af58ef24cf66ec5047c84c17f1d607bb3c648a6dbb89e1d4935a654ea6befc6
    license: MIT
    estimated_size: 37M
    description: 'The Fashion-MNIST dataset contains 60,000 training images (and 10,000 test images) of fashion and clothing items, taken from 10 classes. Each image is a standardized 28×28 size in grayscale (784 total pixels). Fashion-MNIST was created by Zalando as a compatible replacement for the original MNIST dataset of handwritten digits.'
    subdatasets:
      train:
        name: Training dataset
        description: Training dataset in Fashion-MNIST.
        format:
          id: table/csv
        path: fashion-mnist_train.csv
      test:
        name: Testing dataset
        description: Testing dataset in Fashion-MNIST.
        format:
          id: table/csv
        path: fashion-mnist_test.csv
# Groningen Meaning Bank (modified): one plain-text file, so `format` uses
# the short scalar form with no options.
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
gmb:
  '1.0.2':
    name: Groningen Meaning Bank Modified
    published: 2019-12-19
    homepage: https://developer.ibm.com/exchanges/data/all/groningen-meaning-bank/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-groningen-meaning-bank-modified/1.0.2/groningen-meaning-bank-modified.tar.gz
    sha512sum: 4b0e6c445bf5be0573ae411f8e0ba307b884300ab6b5473ea0d455dd82b8cf4dc06fb77a9a606850f3b283357f22fd516e91850cea7e45de19ce5625fda2c001
    license: CDLA-Sharing-1.0
    estimated_size: 10M
    description: 'A dataset of multi-sentence texts, together with annotations for parts-of-speech, named entities, lexical categories and other natural language structural phenomena.'
    subdatasets:
      gmb_subset_full:
        name: GMB Subset Full
        description: A full version of the raw dataset. Used to train MAX model – Named Entity Tagger.
        format: text/plain
        path: groningen_meaning_bank_modified/gmb_subset_full.txt
# NOAA weather observations for JFK airport: one cleaned CSV whose `DATE`
# column is declared as a datetime.
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
noaa_jfk:
  '1.1.4':
    name: NOAA Weather Data – JFK Airport
    published: 2019-09-12
    homepage: https://developer.ibm.com/exchanges/data/all/jfk-weather-data/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-data-jfk-airport.tar.gz
    sha512sum: e3f27a8fcc0db5289df356e3f48aef6df56236798d5b3ae3889d358489ec6609d2d797e4c4932b86016d2ce4a379ac0a0749b6fb2c293ebae4e585ea1c8422ac
    license: CDLA-Sharing-1.0
    estimated_size: 3.2M
    description: 'The NOAA JFK dataset contains 114,546 hourly observations of various local climatological variables (including visibility, temperature, wind speed and direction, humidity, dew point, and pressure). The data was collected by a NOAA weather station located at the John F. Kennedy International Airport in Queens, New York.'
    subdatasets:
      jfk_weather_cleaned:
        name: Cleaned JFK Weather Data
        description: Cleaned version of the JFK weather data.
        format:
          id: table/csv
          options:
            columns:
              DATE: datetime
        path: noaa-weather-data-jfk-airport/jfk_weather_cleaned.csv
# Taranaki Basin Curated Well Logs: a `logs` CSV of per-depth measurements
# and a `coordinates` CSV of well locations.
#
# Fix: the column key `Y` is now quoted in both subdatasets. Under the
# YAML 1.1 bool type a bare `Y` is a boolean literal, so a strict 1.1 loader
# would turn the column name into `true` and the schema would no longer match
# the CSV header. Quoting is a no-op for loaders that already read it as text.
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
taranaki_basin_curated_well_logs:
  "1.0.0":
    name: Taranaki Basin Curated Well Logs
    published: 2020-05-10
    homepage: https://developer.ibm.com/exchanges/data/all/taranaki-basin-curated-well-logs/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-taranaki-basin-curated-well-logs/1.0.0/taranaki-basin-curated-well-logs.tar.gz
    sha512sum: 6f1a8e051806b934e650dca7f69c58505635303ea3a47bfa36da175056669e5a09057fac8895d4f01db74a547c63ce6c6b8959854f10d6265cda71c377d06183
    license: CDLA-Sharing-1.0
    estimated_size: 873M
    description: "A dataset containing well logs curated from the 2016 New Zealand Petroleum Exploration public data pack."
    subdatasets:
      logs:
        name: Taranaki Basin Curated Well Logs
        description: Log information
        format:
          id: table/csv
          options:
            columns:
              BS: 'float'
              CALI: 'float'
              DENS: 'float'
              DRHO: 'float'
              DTC: 'float'
              GR: 'float'
              NEUT: 'float'
              PEF: 'float'
              RESD: 'float'
              RESM: 'float'
              RESS: 'float'
              SP: 'float'
              TEMP: 'float'
              TENS: 'float'
              X: 'float'
              # Quoted on purpose: bare Y is a YAML 1.1 boolean literal.
              "Y": 'float'
              Z: 'float'
              DEPT: 'float'
              ONSHORE: 'boolean'
              DIRSURVEY: 'boolean'
              WELLNAME: 'string'
              FORMATION: 'string'
        path: taranaki-basin-curated-well-logs/logs.csv
      coordinates:
        name: Taranaki Basin Curated Well Coordinates
        description: Coordinates of the wells
        format:
          id: table/csv
          options:
            columns:
              WELL_NAME: 'string'
              LAT: 'float'
              LONG: 'float'
              X: 'float'
              # Quoted on purpose: bare Y is a YAML 1.1 boolean literal.
              "Y": 'float'
        path: taranaki-basin-curated-well-logs/coords.csv
# TensorFlow Speech Commands: one subdataset per spoken word, each selecting
# its .wav files with a regex over the extracted archive paths.
#
# Subdataset keys and names are all quoted. That is required for
# yes / no / on / off, which YAML 1.1 loaders would otherwise resolve to
# booleans, and the rest are quoted the same way for uniformity.
# Regex values are single-quoted so the backslash in `\.` is literal.
#
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
# NOTE(review): the version key says 1.0.0 but download_url points at
# .../1.0.1/ — confirm the archive and sha512sum match the advertised version.
tensorflow_speech_commands:
  '1.0.0':
    name: TensorFlow Speech Commands
    published: 2020-03-17
    homepage: https://developer.ibm.com/exchanges/data/all/speech-commands/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-tensorflow-speech-commands/1.0.1/tensorflow-speech-commands.tar.gz
    sha512sum: 1c4b41d9f719454f4155b4540e8aec0b1c5bfd555974f4e9e2253dd4e52b395ecc239f26a16a86ef047d1aac78a9ac1fe41f9c559a1d3478d951e4f078bb0fb8
    license: CC-BY-4.0
    estimated_size: 1.49G
    description: 'TensorFlow Speech Command dataset is a set of one-second .wav audio files, each containing a single spoken English word. These words are from a small set of commands, and are spoken by a variety of different speakers. 20 of the words are core words, while 10 words are auxiliary words that could act as tests for algorithms in ignoring speeches that do not contain triggers. Included along with the 30 words is a collection of background noise audio files. The dataset was originally designed for limited vocabulary speech recognition tasks. The audio clips were originally collected by Google, and recorded by volunteers in uncontrolled locations around the world.'
    subdatasets:
      'one':
        name: 'one'
        description: 'one'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/one/.*\.wav'
      'yes':
        name: 'yes'
        description: 'yes'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/yes/.*\.wav'
      'five':
        name: 'five'
        description: 'five'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/five/.*\.wav'
      'zero':
        name: 'zero'
        description: 'zero'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/zero/.*\.wav'
      'happy':
        name: 'happy'
        description: 'happy'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/happy/.*\.wav'
      'down':
        name: 'down'
        description: 'down'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/down/.*\.wav'
      'four':
        name: 'four'
        description: 'four'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/four/.*\.wav'
      'dog':
        name: 'dog'
        description: 'dog'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/dog/.*\.wav'
      'stop':
        name: 'stop'
        description: 'stop'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/stop/.*\.wav'
      'wow':
        name: 'wow'
        description: 'wow'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/wow/.*\.wav'
      'no':
        name: 'no'
        description: 'no'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/no/.*\.wav'
      'house':
        name: 'house'
        description: 'house'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/house/.*\.wav'
      'three':
        name: 'three'
        description: 'three'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/three/.*\.wav'
      'bird':
        name: 'bird'
        description: 'bird'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/bird/.*\.wav'
      'eight':
        name: 'eight'
        description: 'eight'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/eight/.*\.wav'
      'two':
        name: 'two'
        description: 'two'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/two/.*\.wav'
      'cat':
        name: 'cat'
        description: 'cat'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/cat/.*\.wav'
      'off':
        name: 'off'
        description: 'off'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/off/.*\.wav'
      'sheila':
        name: 'sheila'
        description: 'sheila'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/sheila/.*\.wav'
      'six':
        name: 'six'
        description: 'six'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/six/.*\.wav'
      'tree':
        name: 'tree'
        description: 'tree'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/tree/.*\.wav'
      'nine':
        name: 'nine'
        description: 'nine'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/nine/.*\.wav'
      'up':
        name: 'up'
        description: 'up'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/up/.*\.wav'
      'marvin':
        name: 'marvin'
        description: 'marvin'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/marvin/.*\.wav'
      'seven':
        name: 'seven'
        description: 'seven'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/seven/.*\.wav'
      'right':
        name: 'right'
        description: 'right'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/right/.*\.wav'
      'on':
        name: 'on'
        description: 'on'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/on/.*\.wav'
      'go':
        name: 'go'
        description: 'go'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/go/.*\.wav'
      '_background_noise_':
        name: '_background_noise_'
        description: '_background_noise_'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/_background_noise_/.*\.wav'
      'left':
        name: 'left'
        description: 'left'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/left/.*\.wav'
      # NOTE(review): this entry is neither one of the 30 words nor the
      # background-noise folder named in the description, and its regex
      # repeats the archive root directory — verify it matches any files.
      'TensorFlow-Speech-Commands':
        name: 'TensorFlow-Speech-Commands'
        description: 'TensorFlow-Speech-Commands'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/TensorFlow-Speech-Commands/.*\.wav'
      'bed':
        name: 'bed'
        description: 'bed'
        format:
          id: audio/wav
        path:
          type: regex
          value: 'TensorFlow-Speech-Commands/bed/.*\.wav'
# IBM Debater Thematic Clustering of Sentences: a single CSV with four
# string columns.
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
thematic_clustering_of_sentences:
  '1.0.2':
    name: IBM Debater® Thematic Clustering of Sentences
    published: 2019-08-01
    homepage: https://developer.ibm.com/exchanges/data/all/thematic-clustering-of-sentences/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-thematic-clustering-of-sentences/1.0.2/thematic-clustering-of-sentences.tar.gz
    sha512sum: 08a3f1a9dc06083eb51874e90d7241f67b676af2cbc28fe6a312694051f53391fc95de70fdcdce404de3578fa389558220ea38d34f70265ed88220d0b14f1aba
    license: CC-BY-SA-3.0
    estimated_size: 10.6M
    description: 'A benchmark of sentence-clustering based on the partition of Wikipedia articles into sections.'
    subdatasets:
      full:
        name: IBM Debater® Thematic Clustering of Sentences
        description: IBM Debater® Thematic Clustering of Sentences complete dataset
        format:
          id: table/csv
          options:
            columns:
              article_title: string
              sentence: string
              cluster_title: string
              article_link: string
        path: dataset.csv
# IBM Debater Wikipedia Category Stance: a single CSV with four string
# columns (note the capitalised column names).
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
wikipedia_category_stance:
  '1.0.2':
    name: IBM Debater® Wikipedia Category Stance
    published: 2019-08-01
    homepage: https://developer.ibm.com/exchanges/data/all/wikipedia-category-stance/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-wikipedia-category-stance/1.0.2/wikipedia-category-stance.tar.gz
    sha512sum: a8e056737699a5a52ef5b7bd91f13f9a672e091b7636010cdcc0ba9d34c322dd761c50bdda01fe988ed6650470f9410ef9f8789e54c4d90124606f7a889f57b4
    license: CC-BY-3.0
    estimated_size: 525K
    description: 'Wikipedia categories and lists annotated for stance (Pro/Con) towards the concepts'
    subdatasets:
      full:
        name: IBM Debater® Wikipedia Category Stance Dataset
        description: IBM Debater® Wikipedia Category Stance complete dataset
        format:
          id: table/csv
          options:
            columns:
              Label: string
              Concept: string
              Category: string
              URL: string
        path: WikipediaCategoriesResults.csv
# IBM Debater Wikipedia Oriented Relatedness: a single CSV of concept pairs
# with a relatedness score. The file is declared as ISO-8859-1 encoded.
#
# Fix: subdataset `full` was the only subdataset in this manifest without a
# `description`; one is added, mirroring the sibling IBM Debater entries.
# NOTE(review): wording of the new description is a placeholder taken from
# the existing `name` — adjust if the maintainers prefer different text.
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
wikipedia_oriented_relatedness:
  "1.0.2":
    name: IBM Debater® Wikipedia Oriented Relatedness
    published: 2019-08-01
    homepage: https://developer.ibm.com/exchanges/data/all/wikipedia-oriented-relatedness/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-wikipedia-oriented-relatedness/1.0.2/wikipedia-oriented-relatedness.tar.gz
    sha512sum: 270f0bb4711acff8f88e9ce10403a72ce13b8ad21205f61d41c17c520b1d30bccefcd2312db92c80ec525a8e8a926b2ff77f58d1e95efb0b17b945c8f5de4e7f
    license: CC-BY-SA-3.0
    estimated_size: 3.4M
    description: "Pairs of concepts from Wikipedia scored for their level of relatedness."
    subdatasets:
      full:
        name: IBM Debater® Wikipedia Oriented Relatedness complete dataset
        description: IBM Debater® Wikipedia Oriented Relatedness complete dataset
        format:
          id: table/csv
          options:
            encoding: 'ISO-8859-1'
            columns:
              source_article_uri: 'string'
              concept_1: 'string'
              concept_2: 'string'
              score: 'float'
              concept_1_uri: 'string'
              concept_2_uri: 'string'
              train_test: 'string'
        path: WORD.csv
# WikiText-103: three plain-text token files (train / valid / test), each
# using the short scalar form of `format`.
# NOTE(review): nesting was reconstructed from a flattened listing; this entry
# is expected to live inside the `datasets:` mapping — confirm indentation.
wikitext103:
  '1.0.1':
    name: WikiText-103
    published: 2020-03-17
    homepage: https://developer.ibm.com/exchanges/data/all/wikitext-103/
    download_url: https://dax-cdn.cdn.appdomain.cloud/dax-wikitext-103/1.0.1/wikitext-103.tar.gz
    sha512sum: c8186919aa1840af6b734ea41abc580574ea8efe2fafda220f5d01002464d17566d84be5199b875136c9593f0e0678fb5d7c84bb2231de8b4151cb9c83fa2109
    license: CC-BY-3.0
    estimated_size: 181M
    description: 'The WikiText-103 dataset is a collection of over 100 million tokens extracted from the set of verified ‘Good’ and ‘Featured’ articles on Wikipedia.'
    subdatasets:
      train:
        name: Train Tokens
        description: Tokens in the training subset
        format: text/plain
        path: wikitext-103/wiki.train.tokens
      valid:
        name: Validation Tokens
        description: Tokens in the validation subset
        format: text/plain
        path: wikitext-103/wiki.valid.tokens
      test:
        name: Test Tokens
        description: Tokens in the testing subset
        format: text/plain
        path: wikitext-103/wiki.test.tokens