From 2bb7dbbb8e77757d97c2fd0ce55b818cf77e111a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 14:39:41 +0000 Subject: [PATCH] Deployed 73959ef with MkDocs version: 1.6.0 --- .nojekyll | 0 404.html | 584 ++ add-your-own-data/index.html | 857 +++ api/base_dataset/index.html | 3901 ++++++++++ api/config/index.html | 980 +++ api/hf_dataset/index.html | 992 +++ api/jsonl_dataset/index.html | 1082 +++ assets/_mkdocstrings.css | 119 + assets/images/favicon.png | Bin 0 -> 1870 bytes assets/javascripts/bundle.fe8b6f2b.min.js | 29 + assets/javascripts/bundle.fe8b6f2b.min.js.map | 7 + assets/javascripts/lunr/min/lunr.ar.min.js | 1 + assets/javascripts/lunr/min/lunr.da.min.js | 18 + assets/javascripts/lunr/min/lunr.de.min.js | 18 + assets/javascripts/lunr/min/lunr.du.min.js | 18 + assets/javascripts/lunr/min/lunr.el.min.js | 1 + assets/javascripts/lunr/min/lunr.es.min.js | 18 + assets/javascripts/lunr/min/lunr.fi.min.js | 18 + assets/javascripts/lunr/min/lunr.fr.min.js | 18 + assets/javascripts/lunr/min/lunr.he.min.js | 1 + assets/javascripts/lunr/min/lunr.hi.min.js | 1 + assets/javascripts/lunr/min/lunr.hu.min.js | 18 + assets/javascripts/lunr/min/lunr.hy.min.js | 1 + assets/javascripts/lunr/min/lunr.it.min.js | 18 + assets/javascripts/lunr/min/lunr.ja.min.js | 1 + assets/javascripts/lunr/min/lunr.jp.min.js | 1 + assets/javascripts/lunr/min/lunr.kn.min.js | 1 + assets/javascripts/lunr/min/lunr.ko.min.js | 1 + assets/javascripts/lunr/min/lunr.multi.min.js | 1 + assets/javascripts/lunr/min/lunr.nl.min.js | 18 + assets/javascripts/lunr/min/lunr.no.min.js | 18 + assets/javascripts/lunr/min/lunr.pt.min.js | 18 + assets/javascripts/lunr/min/lunr.ro.min.js | 18 + assets/javascripts/lunr/min/lunr.ru.min.js | 18 + assets/javascripts/lunr/min/lunr.sa.min.js | 1 + .../lunr/min/lunr.stemmer.support.min.js | 1 + assets/javascripts/lunr/min/lunr.sv.min.js | 18 + assets/javascripts/lunr/min/lunr.ta.min.js | 
1 + assets/javascripts/lunr/min/lunr.te.min.js | 1 + assets/javascripts/lunr/min/lunr.th.min.js | 1 + assets/javascripts/lunr/min/lunr.tr.min.js | 18 + assets/javascripts/lunr/min/lunr.vi.min.js | 1 + assets/javascripts/lunr/min/lunr.zh.min.js | 1 + assets/javascripts/lunr/tinyseg.js | 206 + assets/javascripts/lunr/wordcut.js | 6708 +++++++++++++++++ .../workers/search.b8dbb3d2.min.js | 42 + .../workers/search.b8dbb3d2.min.js.map | 7 + assets/stylesheets/main.76a95c52.min.css | 1 + assets/stylesheets/main.76a95c52.min.css.map | 1 + assets/stylesheets/palette.06af60db.min.css | 1 + .../stylesheets/palette.06af60db.min.css.map | 1 + compose-train-validation-data/index.html | 632 ++ config-files/index.html | 744 ++ datasets/index.html | 1625 ++++ datasets/language_af/index.html | 1197 +++ datasets/language_am/index.html | 1197 +++ datasets/language_an/index.html | 1149 +++ datasets/language_ar/index.html | 1197 +++ datasets/language_arz/index.html | 1149 +++ datasets/language_as/index.html | 1149 +++ datasets/language_ast/index.html | 1149 +++ datasets/language_av/index.html | 1149 +++ datasets/language_az/index.html | 1149 +++ datasets/language_azb/index.html | 1149 +++ datasets/language_ba/index.html | 1149 +++ datasets/language_be/index.html | 1149 +++ datasets/language_bg/index.html | 1633 ++++ datasets/language_bh/index.html | 1149 +++ datasets/language_bn/index.html | 1149 +++ datasets/language_bo/index.html | 1149 +++ datasets/language_bpy/index.html | 1149 +++ datasets/language_br/index.html | 1149 +++ datasets/language_bs/index.html | 1149 +++ datasets/language_bxr/index.html | 1149 +++ datasets/language_ca/index.html | 1457 ++++ datasets/language_ce/index.html | 1149 +++ datasets/language_ceb/index.html | 1149 +++ datasets/language_ckb/index.html | 1149 +++ datasets/language_code/index.html | 4669 ++++++++++++ datasets/language_cs/index.html | 1545 ++++ datasets/language_cv/index.html | 1149 +++ datasets/language_cy/index.html | 1149 +++ 
datasets/language_da/index.html | 1545 ++++ datasets/language_de/index.html | 1589 ++++ datasets/language_dsb/index.html | 1149 +++ datasets/language_dv/index.html | 1149 +++ datasets/language_el/index.html | 1633 ++++ datasets/language_en/index.html | 2837 +++++++ datasets/language_eo/index.html | 1149 +++ datasets/language_es/index.html | 1545 ++++ datasets/language_et/index.html | 1545 ++++ datasets/language_eu/index.html | 1457 ++++ datasets/language_fa/index.html | 1149 +++ datasets/language_fi/index.html | 1545 ++++ datasets/language_fr/index.html | 1593 ++++ datasets/language_fy/index.html | 1149 +++ datasets/language_ga/index.html | 1457 ++++ datasets/language_gd/index.html | 1149 +++ datasets/language_gl/index.html | 1369 ++++ datasets/language_gn/index.html | 1149 +++ datasets/language_gom/index.html | 1149 +++ datasets/language_gsw/index.html | 1149 +++ datasets/language_gu/index.html | 1149 +++ datasets/language_ha/index.html | 669 ++ datasets/language_he/index.html | 1149 +++ datasets/language_hi/index.html | 1149 +++ datasets/language_hr/index.html | 1589 ++++ datasets/language_hsb/index.html | 1149 +++ datasets/language_ht/index.html | 1149 +++ datasets/language_hu/index.html | 1501 ++++ datasets/language_hy/index.html | 1149 +++ datasets/language_ia/index.html | 1149 +++ datasets/language_id/index.html | 1149 +++ datasets/language_ie/index.html | 1149 +++ datasets/language_ig/index.html | 669 ++ datasets/language_ilo/index.html | 1149 +++ datasets/language_io/index.html | 1149 +++ datasets/language_is/index.html | 1149 +++ datasets/language_it/index.html | 1545 ++++ datasets/language_ja/index.html | 1149 +++ datasets/language_jbo/index.html | 1149 +++ datasets/language_jv/index.html | 1149 +++ datasets/language_ka/index.html | 1149 +++ datasets/language_kk/index.html | 1149 +++ datasets/language_km/index.html | 1149 +++ datasets/language_kn/index.html | 1149 +++ datasets/language_ko/index.html | 1149 +++ datasets/language_krc/index.html | 1149 +++ 
datasets/language_ku/index.html | 1149 +++ datasets/language_kv/index.html | 1149 +++ datasets/language_kw/index.html | 1149 +++ datasets/language_ky/index.html | 1197 +++ datasets/language_la/index.html | 1149 +++ datasets/language_lb/index.html | 1149 +++ datasets/language_lez/index.html | 1149 +++ datasets/language_li/index.html | 1149 +++ datasets/language_lmo/index.html | 1149 +++ datasets/language_lo/index.html | 1149 +++ datasets/language_lt/index.html | 1457 ++++ datasets/language_lv/index.html | 1369 ++++ datasets/language_mai/index.html | 1149 +++ datasets/language_mg/index.html | 1149 +++ datasets/language_mhr/index.html | 1149 +++ datasets/language_min/index.html | 1149 +++ datasets/language_mk/index.html | 1149 +++ datasets/language_ml/index.html | 1149 +++ datasets/language_mn/index.html | 1149 +++ datasets/language_mr/index.html | 1149 +++ datasets/language_mrj/index.html | 1149 +++ datasets/language_ms/index.html | 1149 +++ datasets/language_mt/index.html | 1369 ++++ datasets/language_multi/index.html | 1149 +++ datasets/language_mwl/index.html | 1149 +++ datasets/language_my/index.html | 1149 +++ datasets/language_mzn/index.html | 1149 +++ datasets/language_nah/index.html | 1149 +++ datasets/language_nds/index.html | 1149 +++ datasets/language_ne/index.html | 1149 +++ datasets/language_new/index.html | 1149 +++ datasets/language_nl/index.html | 1765 +++++ datasets/language_nn/index.html | 1281 ++++ datasets/language_no/index.html | 1457 ++++ datasets/language_ny/index.html | 669 ++ datasets/language_oc/index.html | 1149 +++ datasets/language_om/index.html | 669 ++ datasets/language_or/index.html | 1149 +++ datasets/language_os/index.html | 1149 +++ datasets/language_pa/index.html | 1149 +++ datasets/language_pl/index.html | 1633 ++++ datasets/language_pms/index.html | 1149 +++ datasets/language_pnb/index.html | 1149 +++ datasets/language_ps/index.html | 1149 +++ datasets/language_pt/index.html | 1653 ++++ datasets/language_qu/index.html | 1149 +++ 
datasets/language_ro/index.html | 1589 ++++ datasets/language_ru/index.html | 1149 +++ datasets/language_rw/index.html | 669 ++ datasets/language_sa/index.html | 1149 +++ datasets/language_sah/index.html | 1149 +++ datasets/language_sd/index.html | 1149 +++ datasets/language_sh/index.html | 1193 +++ datasets/language_si/index.html | 1149 +++ datasets/language_sk/index.html | 1545 ++++ datasets/language_sl/index.html | 1633 ++++ datasets/language_sn/index.html | 669 ++ datasets/language_so/index.html | 1197 +++ datasets/language_sq/index.html | 1149 +++ datasets/language_sr/index.html | 1457 ++++ datasets/language_st/index.html | 669 ++ datasets/language_su/index.html | 1149 +++ datasets/language_sv/index.html | 1545 ++++ datasets/language_sw/index.html | 1197 +++ datasets/language_ta/index.html | 1149 +++ datasets/language_te/index.html | 1149 +++ datasets/language_tg/index.html | 1149 +++ datasets/language_th/index.html | 1149 +++ datasets/language_ti/index.html | 669 ++ datasets/language_tk/index.html | 1149 +++ datasets/language_tl/index.html | 1149 +++ datasets/language_tr/index.html | 1149 +++ datasets/language_tt/index.html | 1149 +++ datasets/language_ug/index.html | 1149 +++ datasets/language_uk/index.html | 1501 ++++ datasets/language_ur/index.html | 1149 +++ datasets/language_uz/index.html | 1149 +++ datasets/language_vi/index.html | 1149 +++ datasets/language_vo/index.html | 1149 +++ datasets/language_wa/index.html | 1149 +++ datasets/language_war/index.html | 1149 +++ datasets/language_wuu/index.html | 1149 +++ datasets/language_x-eml/index.html | 1149 +++ datasets/language_xal/index.html | 1149 +++ datasets/language_xh/index.html | 669 ++ datasets/language_xmf/index.html | 1149 +++ datasets/language_yi/index.html | 1149 +++ datasets/language_yo/index.html | 1197 +++ datasets/language_zh/index.html | 1149 +++ datasets/language_zu/index.html | 669 ++ datasets/tokens_by_language.png | Bin 0 -> 21391 bytes datasets/tokens_by_source.png | Bin 0 -> 41755 
bytes extract-text-data/index.html | 683 ++ getting-started/index.html | 805 ++ ...a_pile_of_books__whit-removebg-preview.png | Bin 0 -> 173165 bytes images/data-schema.svg | 1 + images/favicon-16x16.png | Bin 0 -> 656 bytes images/favicon-32x32.png | Bin 0 -> 1685 bytes images/favicon.ico | Bin 0 -> 15406 bytes images/pipeline.svg | 1 + index.html | 629 ++ objects.inv | 6 + overview/index.html | 703 ++ related-work/index.html | 631 ++ search/search_index.json | 1 + sitemap.xml | 888 +++ sitemap.xml.gz | Bin 0 -> 894 bytes 235 files changed, 223710 insertions(+) create mode 100644 .nojekyll create mode 100644 404.html create mode 100644 add-your-own-data/index.html create mode 100644 api/base_dataset/index.html create mode 100644 api/config/index.html create mode 100644 api/hf_dataset/index.html create mode 100644 api/jsonl_dataset/index.html create mode 100644 assets/_mkdocstrings.css create mode 100644 assets/images/favicon.png create mode 100644 assets/javascripts/bundle.fe8b6f2b.min.js create mode 100644 assets/javascripts/bundle.fe8b6f2b.min.js.map create mode 100644 assets/javascripts/lunr/min/lunr.ar.min.js create mode 100644 assets/javascripts/lunr/min/lunr.da.min.js create mode 100644 assets/javascripts/lunr/min/lunr.de.min.js create mode 100644 assets/javascripts/lunr/min/lunr.du.min.js create mode 100644 assets/javascripts/lunr/min/lunr.el.min.js create mode 100644 assets/javascripts/lunr/min/lunr.es.min.js create mode 100644 assets/javascripts/lunr/min/lunr.fi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.fr.min.js create mode 100644 assets/javascripts/lunr/min/lunr.he.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hu.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hy.min.js create mode 100644 assets/javascripts/lunr/min/lunr.it.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ja.min.js create mode 100644 assets/javascripts/lunr/min/lunr.jp.min.js 
create mode 100644 assets/javascripts/lunr/min/lunr.kn.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ko.min.js create mode 100644 assets/javascripts/lunr/min/lunr.multi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.nl.min.js create mode 100644 assets/javascripts/lunr/min/lunr.no.min.js create mode 100644 assets/javascripts/lunr/min/lunr.pt.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ro.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ru.min.js create mode 100644 assets/javascripts/lunr/min/lunr.sa.min.js create mode 100644 assets/javascripts/lunr/min/lunr.stemmer.support.min.js create mode 100644 assets/javascripts/lunr/min/lunr.sv.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ta.min.js create mode 100644 assets/javascripts/lunr/min/lunr.te.min.js create mode 100644 assets/javascripts/lunr/min/lunr.th.min.js create mode 100644 assets/javascripts/lunr/min/lunr.tr.min.js create mode 100644 assets/javascripts/lunr/min/lunr.vi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.zh.min.js create mode 100644 assets/javascripts/lunr/tinyseg.js create mode 100644 assets/javascripts/lunr/wordcut.js create mode 100644 assets/javascripts/workers/search.b8dbb3d2.min.js create mode 100644 assets/javascripts/workers/search.b8dbb3d2.min.js.map create mode 100644 assets/stylesheets/main.76a95c52.min.css create mode 100644 assets/stylesheets/main.76a95c52.min.css.map create mode 100644 assets/stylesheets/palette.06af60db.min.css create mode 100644 assets/stylesheets/palette.06af60db.min.css.map create mode 100644 compose-train-validation-data/index.html create mode 100644 config-files/index.html create mode 100644 datasets/index.html create mode 100644 datasets/language_af/index.html create mode 100644 datasets/language_am/index.html create mode 100644 datasets/language_an/index.html create mode 100644 datasets/language_ar/index.html create mode 100644 datasets/language_arz/index.html create mode 100644 
datasets/language_as/index.html create mode 100644 datasets/language_ast/index.html create mode 100644 datasets/language_av/index.html create mode 100644 datasets/language_az/index.html create mode 100644 datasets/language_azb/index.html create mode 100644 datasets/language_ba/index.html create mode 100644 datasets/language_be/index.html create mode 100644 datasets/language_bg/index.html create mode 100644 datasets/language_bh/index.html create mode 100644 datasets/language_bn/index.html create mode 100644 datasets/language_bo/index.html create mode 100644 datasets/language_bpy/index.html create mode 100644 datasets/language_br/index.html create mode 100644 datasets/language_bs/index.html create mode 100644 datasets/language_bxr/index.html create mode 100644 datasets/language_ca/index.html create mode 100644 datasets/language_ce/index.html create mode 100644 datasets/language_ceb/index.html create mode 100644 datasets/language_ckb/index.html create mode 100644 datasets/language_code/index.html create mode 100644 datasets/language_cs/index.html create mode 100644 datasets/language_cv/index.html create mode 100644 datasets/language_cy/index.html create mode 100644 datasets/language_da/index.html create mode 100644 datasets/language_de/index.html create mode 100644 datasets/language_dsb/index.html create mode 100644 datasets/language_dv/index.html create mode 100644 datasets/language_el/index.html create mode 100644 datasets/language_en/index.html create mode 100644 datasets/language_eo/index.html create mode 100644 datasets/language_es/index.html create mode 100644 datasets/language_et/index.html create mode 100644 datasets/language_eu/index.html create mode 100644 datasets/language_fa/index.html create mode 100644 datasets/language_fi/index.html create mode 100644 datasets/language_fr/index.html create mode 100644 datasets/language_fy/index.html create mode 100644 datasets/language_ga/index.html create mode 100644 datasets/language_gd/index.html create mode 100644 
datasets/language_gl/index.html create mode 100644 datasets/language_gn/index.html create mode 100644 datasets/language_gom/index.html create mode 100644 datasets/language_gsw/index.html create mode 100644 datasets/language_gu/index.html create mode 100644 datasets/language_ha/index.html create mode 100644 datasets/language_he/index.html create mode 100644 datasets/language_hi/index.html create mode 100644 datasets/language_hr/index.html create mode 100644 datasets/language_hsb/index.html create mode 100644 datasets/language_ht/index.html create mode 100644 datasets/language_hu/index.html create mode 100644 datasets/language_hy/index.html create mode 100644 datasets/language_ia/index.html create mode 100644 datasets/language_id/index.html create mode 100644 datasets/language_ie/index.html create mode 100644 datasets/language_ig/index.html create mode 100644 datasets/language_ilo/index.html create mode 100644 datasets/language_io/index.html create mode 100644 datasets/language_is/index.html create mode 100644 datasets/language_it/index.html create mode 100644 datasets/language_ja/index.html create mode 100644 datasets/language_jbo/index.html create mode 100644 datasets/language_jv/index.html create mode 100644 datasets/language_ka/index.html create mode 100644 datasets/language_kk/index.html create mode 100644 datasets/language_km/index.html create mode 100644 datasets/language_kn/index.html create mode 100644 datasets/language_ko/index.html create mode 100644 datasets/language_krc/index.html create mode 100644 datasets/language_ku/index.html create mode 100644 datasets/language_kv/index.html create mode 100644 datasets/language_kw/index.html create mode 100644 datasets/language_ky/index.html create mode 100644 datasets/language_la/index.html create mode 100644 datasets/language_lb/index.html create mode 100644 datasets/language_lez/index.html create mode 100644 datasets/language_li/index.html create mode 100644 datasets/language_lmo/index.html create mode 100644 
datasets/language_lo/index.html create mode 100644 datasets/language_lt/index.html create mode 100644 datasets/language_lv/index.html create mode 100644 datasets/language_mai/index.html create mode 100644 datasets/language_mg/index.html create mode 100644 datasets/language_mhr/index.html create mode 100644 datasets/language_min/index.html create mode 100644 datasets/language_mk/index.html create mode 100644 datasets/language_ml/index.html create mode 100644 datasets/language_mn/index.html create mode 100644 datasets/language_mr/index.html create mode 100644 datasets/language_mrj/index.html create mode 100644 datasets/language_ms/index.html create mode 100644 datasets/language_mt/index.html create mode 100644 datasets/language_multi/index.html create mode 100644 datasets/language_mwl/index.html create mode 100644 datasets/language_my/index.html create mode 100644 datasets/language_mzn/index.html create mode 100644 datasets/language_nah/index.html create mode 100644 datasets/language_nds/index.html create mode 100644 datasets/language_ne/index.html create mode 100644 datasets/language_new/index.html create mode 100644 datasets/language_nl/index.html create mode 100644 datasets/language_nn/index.html create mode 100644 datasets/language_no/index.html create mode 100644 datasets/language_ny/index.html create mode 100644 datasets/language_oc/index.html create mode 100644 datasets/language_om/index.html create mode 100644 datasets/language_or/index.html create mode 100644 datasets/language_os/index.html create mode 100644 datasets/language_pa/index.html create mode 100644 datasets/language_pl/index.html create mode 100644 datasets/language_pms/index.html create mode 100644 datasets/language_pnb/index.html create mode 100644 datasets/language_ps/index.html create mode 100644 datasets/language_pt/index.html create mode 100644 datasets/language_qu/index.html create mode 100644 datasets/language_ro/index.html create mode 100644 datasets/language_ru/index.html create mode 
100644 datasets/language_rw/index.html create mode 100644 datasets/language_sa/index.html create mode 100644 datasets/language_sah/index.html create mode 100644 datasets/language_sd/index.html create mode 100644 datasets/language_sh/index.html create mode 100644 datasets/language_si/index.html create mode 100644 datasets/language_sk/index.html create mode 100644 datasets/language_sl/index.html create mode 100644 datasets/language_sn/index.html create mode 100644 datasets/language_so/index.html create mode 100644 datasets/language_sq/index.html create mode 100644 datasets/language_sr/index.html create mode 100644 datasets/language_st/index.html create mode 100644 datasets/language_su/index.html create mode 100644 datasets/language_sv/index.html create mode 100644 datasets/language_sw/index.html create mode 100644 datasets/language_ta/index.html create mode 100644 datasets/language_te/index.html create mode 100644 datasets/language_tg/index.html create mode 100644 datasets/language_th/index.html create mode 100644 datasets/language_ti/index.html create mode 100644 datasets/language_tk/index.html create mode 100644 datasets/language_tl/index.html create mode 100644 datasets/language_tr/index.html create mode 100644 datasets/language_tt/index.html create mode 100644 datasets/language_ug/index.html create mode 100644 datasets/language_uk/index.html create mode 100644 datasets/language_ur/index.html create mode 100644 datasets/language_uz/index.html create mode 100644 datasets/language_vi/index.html create mode 100644 datasets/language_vo/index.html create mode 100644 datasets/language_wa/index.html create mode 100644 datasets/language_war/index.html create mode 100644 datasets/language_wuu/index.html create mode 100644 datasets/language_x-eml/index.html create mode 100644 datasets/language_xal/index.html create mode 100644 datasets/language_xh/index.html create mode 100644 datasets/language_xmf/index.html create mode 100644 datasets/language_yi/index.html create mode 
100644 datasets/language_yo/index.html create mode 100644 datasets/language_zh/index.html create mode 100644 datasets/language_zu/index.html create mode 100644 datasets/tokens_by_language.png create mode 100644 datasets/tokens_by_source.png create mode 100644 extract-text-data/index.html create mode 100644 getting-started/index.html create mode 100644 images/A_colorful_parrot_sitting_on_a_pile_of_books__whit-removebg-preview.png create mode 100644 images/data-schema.svg create mode 100644 images/favicon-16x16.png create mode 100644 images/favicon-32x32.png create mode 100644 images/favicon.ico create mode 100644 images/pipeline.svg create mode 100644 index.html create mode 100644 objects.inv create mode 100644 overview/index.html create mode 100644 related-work/index.html create mode 100644 search/search_index.json create mode 100644 sitemap.xml create mode 100644 sitemap.xml.gz diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/404.html b/404.html new file mode 100644 index 0000000..d52da84 --- /dev/null +++ b/404.html @@ -0,0 +1,584 @@ + + + +
+ + + + + + + + + + + + + + + + + + + +The first step for adding a new dataset is to write a new dataset class. +If your data comes from a common source such as Huggingface, you can build upon existing abstractions.
+For example, Huggingface datasets only need to specify some metadata (dataset ID, title, etc.) and the column from which the textual data is extracted (by default the text
column):
# my_datasets/pg19.py
+
+from llm_datasets.datasets.hf_dataset import HFDataset
+from llm_datasets.datasets.base import License, Availability
+
+class PG19Dataset(HFDataset):
+ DATASET_ID = "pg19"
+ TITLE = "Project Gutenberg books published before 1919"
+ HOMEPAGE = "https://huggingface.co/datasets/pg19"
+ LICENSE = License("Apache License Version 2.0 (or public domain?)", url="https://www.apache.org/licenses/LICENSE-2.0.html")
+ CITATION = r"""@article{raecompressive2019,
+ author = {Rae, Jack W and Potapenko, Anna and Jayakumar, Siddhant M and
+ Hillier, Chloe and Lillicrap, Timothy P},
+ title = {Compressive Transformers for Long-Range Sequence Modelling},
+ journal = {arXiv preprint},
+ url = {https://arxiv.org/abs/1911.05507},
+ year = {2019},
+ }
+ """ # noqa
+ AVAILIBILITY = Availability.DIRECT_DOWNLOAD
+
+ HF_DATASET_ID = "pg19"
+ HF_DATASET_SPLIT = "train"
+ streaming = True
+ text_column_name = "text"
+ title_column_name = "short_book_title"
+
Other datasets may require implementing the full text extraction logic. The example below reads text data from CSV files while excluding specific subsets:
+# my_datasets/csv_example.py
+
+import logging
+import pandas as pd
+from pathlib import Path
+from llm_datasets.datasets.base import BaseDataset, Availability, License
+
+logger = logging.getLogger(__name__)
+
+
+class CSVExampleDataset(BaseDataset):
+ DATASET_ID = "csv_example"
+ TITLE = "An example for a dataset from CSV files"
+ AVAILIBILITY = Availability.ON_REQUEST
+ LANGUAGES = ["en"]
+ LICENSE = License("mixed")
+
+ def get_texts(self):
+ """
+ Extract texts from CSV files (format: "document_id,text,score,url")
+ """
+ # Iterate over CSV files in raw dataset directory
+ for file_path in self.get_dataset_file_paths(needed_suffix=".csv"):
+ file_name = Path(file_path).name
+
+ if (
+ file_name.startswith("mc4_")
+ or file_name.startswith("colossal-oscar-")
+ or file_name.startswith("wikimedia")
+ ):
+ # skip subsets that overlap with other datasets (based on file name)
+ continue
+
+ logger.info("Reading CSV: %s", file_path)
+ try:
+ # Use chunks to reduce memory consumption
+ for df in pd.read_csv(file_path, sep=",", chunksize=10_000):
+ for text in df.text.values:
+ # Pass extracted text
+ yield text
+ except ValueError as e:
+ logger.error("Error in file %s; error = %s", file_path, e)
+
Each dataset class needs to be registered with llm-datasets
so that the commands know which classes are available.
+This can be done by making a new Python module with a get_registered_dataset_classes
function that returns a list of dataset classes:
# my_datasets/dataset_registry.py
+from my_datasets.pg19 import PG19Dataset
+
+def get_registered_dataset_classes():
+ return [
+ PG19Dataset,
+ ]
+
To load the registered datasets in the pipeline commands, you need to specify the --extra_dataset_registries
argument:
+ Bases: object
Base class for all datasets. It implements all generic loading, processing, and writing methods.
+ +src/llm_datasets/datasets/base.py
128 + 129 + 130 + 131 + 132 + 133 + 134 + 135 + 136 + 137 + 138 + 139 + 140 + 141 + 142 + 143 + 144 + 145 + 146 + 147 + 148 + 149 + 150 + 151 + 152 + 153 + 154 + 155 + 156 + 157 + 158 + 159 + 160 + 161 + 162 + 163 + 164 + 165 + 166 + 167 + 168 + 169 + 170 + 171 + 172 + 173 + 174 + 175 + 176 + 177 + 178 + 179 + 180 + 181 + 182 + 183 + 184 + 185 + 186 + 187 + 188 + 189 + 190 + 191 + 192 + 193 + 194 + 195 + 196 + 197 + 198 + 199 + 200 + 201 + 202 + 203 + 204 + 205 + 206 + 207 + 208 + 209 + 210 + 211 + 212 + 213 + 214 + 215 + 216 + 217 + 218 + 219 + 220 + 221 + 222 + 223 + 224 + 225 + 226 + 227 + 228 + 229 + 230 + 231 + 232 + 233 + 234 + 235 + 236 + 237 + 238 + 239 + 240 + 241 + 242 + 243 + 244 + 245 + 246 + 247 + 248 + 249 + 250 + 251 + 252 + 253 + 254 + 255 + 256 + 257 + 258 + 259 + 260 + 261 + 262 + 263 + 264 + 265 + 266 + 267 + 268 + 269 + 270 + 271 + 272 + 273 + 274 + 275 + 276 + 277 + 278 + 279 + 280 + 281 + 282 + 283 + 284 + 285 + 286 + 287 + 288 + 289 + 290 + 291 + 292 + 293 + 294 + 295 + 296 + 297 + 298 + 299 + 300 + 301 + 302 + 303 + 304 + 305 + 306 + 307 + 308 + 309 + 310 + 311 + 312 + 313 + 314 + 315 + 316 + 317 + 318 + 319 + 320 + 321 + 322 + 323 + 324 + 325 + 326 + 327 + 328 + 329 + 330 + 331 + 332 + 333 + 334 + 335 + 336 + 337 + 338 + 339 + 340 + 341 + 342 + 343 + 344 + 345 + 346 + 347 + 348 + 349 + 350 + 351 + 352 + 353 + 354 + 355 + 356 + 357 + 358 + 359 + 360 + 361 + 362 + 363 + 364 + 365 + 366 + 367 + 368 + 369 + 370 + 371 + 372 + 373 + 374 + 375 + 376 + 377 + 378 + 379 + 380 + 381 + 382 + 383 + 384 + 385 + 386 + 387 + 388 + 389 + 390 + 391 + 392 + 393 + 394 + 395 + 396 + 397 + 398 + 399 + 400 + 401 + 402 + 403 + 404 + 405 + 406 + 407 + 408 + 409 + 410 + 411 + 412 + 413 + 414 + 415 + 416 + 417 + 418 + 419 + 420 + 421 + 422 + 423 + 424 + 425 + 426 + 427 + 428 + 429 + 430 + 431 + 432 + 433 + 434 + 435 + 436 + 437 + 438 + 439 + 440 + 441 + 442 + 443 + 444 + 445 + 446 + 447 + 448 + 449 + 450 + 451 + 452 + 453 + 454 + 455 + 456 + 457 + 458 + 459 + 460 + 
461 + 462 + 463 + 464 + 465 + 466 + 467 + 468 + 469 + 470 + 471 + 472 + 473 + 474 + 475 + 476 + 477 + 478 + 479 + 480 + 481 + 482 + 483 + 484 + 485 + 486 + 487 + 488 + 489 + 490 + 491 + 492 + 493 + 494 + 495 + 496 + 497 + 498 + 499 + 500 + 501 + 502 + 503 + 504 + 505 + 506 + 507 + 508 + 509 + 510 + 511 + 512 + 513 + 514 + 515 + 516 + 517 + 518 + 519 + 520 + 521 + 522 + 523 + 524 + 525 + 526 + 527 + 528 + 529 + 530 + 531 + 532 + 533 + 534 + 535 + 536 + 537 + 538 + 539 + 540 + 541 + 542 + 543 + 544 + 545 + 546 + 547 + 548 + 549 + 550 + 551 + 552 + 553 + 554 + 555 + 556 + 557 + 558 + 559 + 560 + 561 + 562 + 563 + 564 + 565 + 566 + 567 + 568 + 569 + 570 + 571 + 572 + 573 + 574 + 575 + 576 + 577 + 578 + 579 + 580 + 581 + 582 + 583 + 584 + 585 + 586 + 587 + 588 + 589 + 590 + 591 + 592 + 593 + 594 + 595 + 596 + 597 + 598 + 599 + 600 + 601 + 602 + 603 + 604 + 605 + 606 + 607 + 608 + 609 + 610 + 611 + 612 + 613 + 614 + 615 + 616 + 617 + 618 + 619 + 620 + 621 + 622 + 623 + 624 + 625 + 626 + 627 + 628 + 629 + 630 + 631 + 632 + 633 + 634 + 635 + 636 + 637 + 638 + 639 + 640 + 641 + 642 + 643 + 644 + 645 + 646 + 647 + 648 + 649 + 650 + 651 + 652 + 653 + 654 + 655 + 656 + 657 + 658 + 659 + 660 + 661 + 662 + 663 + 664 + 665 + 666 + 667 + 668 + 669 + 670 + 671 + 672 + 673 + 674 + 675 + 676 + 677 + 678 + 679 + 680 + 681 + 682 + 683 + 684 + 685 + 686 + 687 + 688 + 689 + 690 + 691 + 692 + 693 + 694 + 695 + 696 + 697 + 698 + 699 + 700 + 701 + 702 + 703 + 704 + 705 + 706 + 707 + 708 + 709 + 710 + 711 + 712 + 713 + 714 + 715 + 716 + 717 + 718 + 719 + 720 + 721 + 722 + 723 + 724 + 725 + 726 + 727 + 728 + 729 + 730 + 731 + 732 + 733 + 734 + 735 + 736 + 737 + 738 + 739 + 740 + 741 + 742 + 743 + 744 + 745 + 746 + 747 + 748 + 749 + 750 + 751 + 752 + 753 + 754 + 755 + 756 + 757 + 758 + 759 + 760 + 761 + 762 + 763 + 764 + 765 + 766 + 767 + 768 + 769 + 770 + 771 + 772 + 773 + 774 + 775 + 776 + 777 + 778 + 779 + 780 + 781 + 782 + 783 + 784 + 785 + 786 + 787 + 788 + 789 + 790 + 791 + 792 + 793 + 
794 + 795 + 796 + 797 + 798 + 799 + 800 + 801 + 802 + 803 + 804 + 805 + 806 + 807 + 808 + 809 + 810 + 811 + 812 + 813 + 814 + 815 + 816 + 817 + 818 + 819 + 820 + 821 + 822 + 823 + 824 + 825 + 826 + 827 + 828 + 829 + 830 + 831 + 832 + 833 + 834 + 835 + 836 + 837 + 838 + 839 + 840 + 841 + 842 + 843 + 844 + 845 + 846 + 847 + 848 + 849 + 850 + 851 + 852 + 853 + 854 + 855 + 856 + 857 + 858 + 859 + 860 + 861 + 862 + 863 + 864 + 865 + 866 + 867 + 868 + 869 + 870 + 871 + 872 + 873 + 874 + 875 + 876 + 877 + 878 + 879 + 880 + 881 + 882 + 883 + 884 + 885 + 886 + 887 + 888 + 889 + 890 + 891 + 892 + 893 + 894 + 895 + 896 + 897 + 898 + 899 + 900 + 901 + 902 + 903 + 904 + 905 + 906 + 907 + 908 + 909 + 910 + 911 + 912 + 913 + 914 + 915 + 916 + 917 + 918 + 919 + 920 + 921 + 922 + 923 + 924 + 925 + 926 + 927 + 928 + 929 + 930 + 931 + 932 + 933 + 934 + 935 + 936 + 937 + 938 + 939 + 940 + 941 + 942 + 943 + 944 + 945 + 946 + 947 + 948 + 949 + 950 + 951 + 952 + 953 + 954 + 955 + 956 + 957 + 958 + 959 + 960 + 961 + 962 + 963 + 964 + 965 + 966 + 967 + 968 + 969 + 970 + 971 + 972 + 973 + 974 + 975 + 976 + 977 + 978 + 979 + 980 + 981 + 982 + 983 + 984 + 985 + 986 + 987 + 988 + 989 + 990 + 991 + 992 + 993 + 994 + 995 + 996 + 997 + 998 + 999 +1000 +1001 +1002 +1003 +1004 +1005 +1006 +1007 +1008 +1009 +1010 |
|
filter_documents(documents)
+
+Applies basic filtering on the texts before saving
+ +src/llm_datasets/datasets/base.py
filter_texts(texts)
+
+Applies basic filtering on the texts before saving
+ +src/llm_datasets/datasets/base.py
generate_texts_from_output(shuffled=False, batch_size=None, limit=0, offset=0, shuffle_output_file_paths=False, reader_implementation='pyarrow', cast_to_py_string=False)
+
+An iterator over texts from processed output files.
+ +src/llm_datasets/datasets/base.py
698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 +781 +782 +783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 +809 +810 +811 +812 +813 +814 +815 +816 +817 +818 +819 +820 +821 +822 +823 +824 +825 +826 +827 +828 +829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +878 +879 +880 +881 +882 +883 +884 +885 +886 +887 +888 +889 +890 +891 +892 +893 +894 +895 +896 +897 +898 +899 +900 +901 +902 +903 +904 |
|
get_compression_from_output_files(shuffled=False)
+
+NOTE: Currently only implemented for parquet
format.
src/llm_datasets/datasets/base.py
get_estimated_bytes_from_output(shuffled=False, read_first_n_rows=1000)
+
+Estimate byte size of output text: +- read first N rows of shuffled output files and count their byte size +- multiply counted bytes by total number of rows
+ +src/llm_datasets/datasets/base.py
get_output_rows_count(shuffled=False)
+
+Read metadata from parquet files and extract number of rows
+ +src/llm_datasets/datasets/base.py
get_sampling_factor()
+
+Sampling is defined based on dataset ID, source ID, or language.
+ +src/llm_datasets/datasets/base.py
is_selected()
+
+Is this dataset part of selected datasets or sources?
+ +src/llm_datasets/datasets/base.py
save_stats()
+
+Save the processing statistics (counter) into a JSON file in the output directory.
+ +src/llm_datasets/datasets/base.py
save_texts(texts, append=False)
+
+Save texts in different formats
+ +src/llm_datasets/datasets/base.py
save_texts_to_jsonl(texts, append=False)
+
+Write JSONL files to
src/llm_datasets/datasets/base.py
save_texts_to_parquet(texts, file_path=None, apply_filter=True)
+
+Save text in parquet (single column schema, in batches)
+ +src/llm_datasets/datasets/base.py
+ Bases: object
src/llm_datasets/utils/config.py
64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 |
|
get_job_id()
+
+Returns the job ID, either manually set or read from the environment variable (SLURM_JOBID)
+ +src/llm_datasets/utils/config.py
+ Bases: BaseDocumentDataset
src/llm_datasets/datasets/hf_dataset.py
12 + 13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 |
|
+ Bases: JSONLMixin
, BaseTextDataset
src/llm_datasets/datasets/jsonl_dataset.py
54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 |
|
get_document_from_item(item)
+
+This simply returns the document with a text field from item (but dataset classes can override this to implement filtering etc.)
+ +src/llm_datasets/datasets/jsonl_dataset.py
get_text_from_item(item)
+
+This simply returns the text field from item (but dataset classes can override this to implement filtering etc.)
+ + +get_texts()
+
+Iterate over all input files and read JSON from each line.
+ + +get_texts_with_multi_proc()
+
+Iterate over all input files in parallel and read JSON from each line.
+ + +get_texts_with_single_proc()
+
+Iterate over all input files and read JSON from each line.
+ +src/llm_datasets/datasets/jsonl_dataset.py